Fix data pipelines once and for all

2026-06-10 21:27:32 +01:00 · 2026-06-10 21:27:32 +01:00 · 4012e4e047
commit 4012e4e047
parent 08560476c5
46 changed files with 4508 additions and 855 deletions
--- a/Makefile.data
+++ b/Makefile.data
@ -48,7 +48,8 @@ NAPTAN          := $(DATA_DIR)/naptan.parquet
 BROADBAND       := $(DATA_DIR)/broadband.parquet
 CONSERVATION_AREAS := $(DATA_DIR)/conservation_areas.geojson
 LISTED_BUILDINGS := $(DATA_DIR)/listed_buildings.gpkg
-SCHOOL_PROX     := $(DATA_DIR)/school_proximity.parquet
+SCHOOL_CATCH    := $(DATA_DIR)/school_catchments.parquet
+LSOA_CHILDREN   := $(DATA_DIR)/lsoa_children.parquet
 RENTAL          := $(DATA_DIR)/rental_prices.parquet
 INSPIRE_DIR     := $(DATA_DIR)/inspire
 OA_BOUNDARIES   := $(DATA_DIR)/oa_boundaries.gpkg
@ -100,19 +101,19 @@ PC_BOUNDARIES_DEPS := pipeline/transform/postcode_boundaries/__main__.py \
    pipeline/transform/postcode_boundaries/voronoi.py
 CRIME_DOWNLOAD_DEPS := pipeline/download/crime.py
 INSPIRE_DOWNLOAD_DEPS := pipeline/download/inspire.py
-TRANSIT_DOWNLOAD_DEPS := pipeline/download/transit_network.py pipeline/download/transxchange2gtfs_shim.js
+TRANSIT_DOWNLOAD_DEPS := pipeline/download/transit_network.py
 MAP_ASSETS_DEPS := pipeline/download/map_assets.py pipeline/transform/transform_poi.py

 # ── Phony aliases ─────────────────────────────────────────────────────────────

 .PHONY: prepare merge tiles satellite-tiles satellite-highres-tiles overlay-tiles noise-overlay-tiles crime-hotspot-tiles tree-overlay-tiles property-border-tiles \
        download-arcgis download-price-paid download-deprivation download-ethnicity \
-        download-naptan download-pois download-grocery-retail-points download-ofsted download-gias download-broadband download-conservation-areas download-listed-buildings download-rental-prices \
+        download-naptan download-pois download-grocery-retail-points download-ofsted download-gias download-lsoa-children download-broadband download-conservation-areas download-listed-buildings download-rental-prices \
        download-postcodes download-noise download-inspire download-crime \
        download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-os-greenspace download-pbf download-fr-tow download-nfi download-ofs-register download-places download-median-age download-england-boundary download-rightmove-outcodes \
        download-map-assets \
        transform-pois transform-epc-pp transform-crime transform-poi-proximity \
-        transform-school-proximity transform-tree-density \
+        transform-school-catchments transform-tree-density \
        generate-postcode-boundaries generate-travel-times enrich-actual-listings

 # Full data build. Everything after "|" is order-only: those files must exist,
 # but a newer timestamp on them alone does not make this target out of date.
 prepare:                      $(PRICES_STAMP) download-places tiles satellite-tiles overlay-tiles property-border-tiles tree-overlay-tiles crime-hotspot-tiles generate-postcode-boundaries download-map-assets generate-travel-times | $(POSTCODES_PQ) $(PROPERTIES_PQ) $(PRICE_INDEX)
@ -139,6 +140,7 @@ download-pois:                $(POIS_RAW)
 download-grocery-retail-points: $(GROCERY_RETAIL_POINTS)
 download-ofsted:              $(OFSTED)
 download-gias:                $(GIAS)
+download-lsoa-children:       $(LSOA_CHILDREN)
 download-broadband:           $(BROADBAND)
 download-conservation-areas:  $(CONSERVATION_AREAS)
 download-listed-buildings:    $(LISTED_BUILDINGS)
@ -150,7 +152,7 @@ download-inspire:             $(INSPIRE_STAMP)
 download-oa-boundaries:       $(OA_BOUNDARIES)
 download-uprn-lookup:         $(UPRN_LOOKUP)
 download-transit-network:     $(TRANSIT_STAMP)
-	$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip --zip $(TRANSIT_DIR)/tfl_gtfs.zip
+	$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip
 download-greenspace:          $(GREENSPACE)
 download-os-greenspace:       $(OS_GREENSPACE)
 download-pbf:                 $(PBF)
@ -168,11 +170,11 @@ transform-pois:               $(POIS_FILTERED)
 transform-epc-pp:             $(EPC_PP)
 transform-crime:              $(CRIME)
 transform-poi-proximity:      $(POI_PROXIMITY)
-transform-school-proximity:   $(SCHOOL_PROX)
+transform-school-catchments: $(SCHOOL_CATCH)
 transform-tree-density:       $(TREE_DENSITY_PC)
 generate-postcode-boundaries: $(PC_BOUNDARIES_STAMP)

-$(PC_BOUNDARIES_STAMP): $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP) $(ARCGIS) $(PC_BOUNDARIES_DEPS)
+$(PC_BOUNDARIES_STAMP): $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP) $(ARCGIS) $(GREENSPACE) $(PC_BOUNDARIES_DEPS)
 	@rm -f $@
 	$(VALIDATE_OUTPUTS) --dir $(INSPIRE_DIR) --zip-glob "$(INSPIRE_DIR)::*.zip"
 	uv run python -m pipeline.transform.postcode_boundaries \
@ -180,6 +182,7 @@ $(PC_BOUNDARIES_STAMP): $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP) $(ARCGI
 		--arcgis $(ARCGIS) \
 		--oa-boundaries $(OA_BOUNDARIES) \
 		--inspire $(INSPIRE_DIR) \
+		--greenspace $(GREENSPACE) \
 		--output $(PC_BOUNDARIES)
 	$(VALIDATE_OUTPUTS) --active-postcode-boundary-match "$(ARCGIS)::$(PC_BOUNDARIES)"
 	@touch $@
@ -273,6 +276,9 @@ $(OFSTED):
 $(GIAS): pipeline/download/gias.py
 	uv run python -m pipeline.download.gias --output $@

+# Produce $(LSOA_CHILDREN) by running the lsoa_children download module.
+# The downloader script is the only prerequisite, so editing it re-runs this.
+$(LSOA_CHILDREN): pipeline/download/lsoa_children.py
+	uv run python -m pipeline.download.lsoa_children --output $@
+
 $(BROADBAND):
 	uv run python -m pipeline.download.broadband --output $@

@ -315,7 +321,7 @@ $(UPRN_LOOKUP):
 $(TRANSIT_STAMP): $(TRANSIT_DOWNLOAD_DEPS)
 	@rm -f $@
 	uv run python -m pipeline.download.transit_network --output $(TRANSIT_DIR)
-	$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip --zip $(TRANSIT_DIR)/tfl_gtfs.zip
+	$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip
 	@touch $@

 $(RENTAL): pipeline/download/rental_prices.py
@ -364,8 +370,8 @@ $(CRIME) $(CRIME_BY_YEAR) &: $(CRIME_STAMP) $(PC_BOUNDARIES_STAMP) pipeline/tran
 $(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) $(OS_GREENSPACE) $(POI_PROXIMITY_DEPS)
 	uv run python -m pipeline.transform.poi_proximity --arcgis $(ARCGIS) --pois $(POIS_FILTERED) --greenspace $(OS_GREENSPACE) --output $@

-$(SCHOOL_PROX): $(OFSTED) $(ARCGIS) $(GIAS) pipeline/transform/school_proximity.py pipeline/utils/poi_counts.py
-	uv run python -m pipeline.transform.school_proximity --ofsted $(OFSTED) --arcgis $(ARCGIS) --gias $(GIAS) --output $@
+# Build $(SCHOOL_CATCH) from the Ofsted, ArcGIS, GIAS and LSOA-children inputs.
+# The transform script and pipeline/utils/poi_counts.py are prerequisites too,
+# so editing either one re-runs the transform.
+$(SCHOOL_CATCH): $(OFSTED) $(ARCGIS) $(GIAS) $(LSOA_CHILDREN) pipeline/transform/school_catchments.py pipeline/utils/poi_counts.py
+	uv run python -m pipeline.transform.school_catchments --ofsted $(OFSTED) --arcgis $(ARCGIS) --gias $(GIAS) --lsoa-children $(LSOA_CHILDREN) --output $@

 $(TREE_DENSITY_PC): $(FR_TOW) $(NFI) $(ARCGIS) $(TREE_DENSITY_DEPS)
 	uv run python -m pipeline.transform.tree_density \
@ -386,6 +392,7 @@ $(PC_BOUNDARIES):
 	@echo "    --arcgis $(ARCGIS) \\"
 	@echo "    --oa-boundaries $(OA_BOUNDARIES) \\"
 	@echo "    --inspire $(INSPIRE_DIR) \\"
+	@echo "    --greenspace $(GREENSPACE) \\"
 	@echo "    --output $@"
 	@echo ""
 	@exit 1
@ -393,7 +400,7 @@ $(PC_BOUNDARIES):
 # ── Final merge → postcode.parquet + properties.parquet ──────────────────────

 $(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
-                $(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(CONSERVATION_AREAS) $(LISTED_BUILDINGS) $(RENTAL) $(MEDIAN_AGE) $(ELECTION) $(TREE_DENSITY_PC) $(MERGE_DEPS)
+                $(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_CATCH) $(BROADBAND) $(CONSERVATION_AREAS) $(LISTED_BUILDINGS) $(RENTAL) $(MEDIAN_AGE) $(ELECTION) $(TREE_DENSITY_PC) $(MERGE_DEPS)
 	@rm -f $@
 	uv run python -m pipeline.transform.merge \
 		--epc-pp $(EPC_PP) \
@ -403,7 +410,7 @@ $(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
 		--ethnicity $(ETHNICITY) \
 		--crime $(CRIME) \
 		--noise $(NOISE) \
-		--school-proximity $(SCHOOL_PROX) \
+		--school-catchments $(SCHOOL_CATCH) \
 		--broadband $(BROADBAND) \
 		--conservation-areas $(CONSERVATION_AREAS) \
 		--listed-buildings $(LISTED_BUILDINGS) \
@ -433,7 +440,7 @@ $(PRICES_STAMP): $(MERGE_STAMP) $(PRICE_INDEX) $(PRICE_ESTIMATE_DEPS) | $(PROPER

 $(ACTUAL_LISTINGS_ENRICHED): $(ACTUAL_LISTINGS_RAW) $(EPC) \
                             $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
-                             $(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) \
+                             $(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_CATCH) $(BROADBAND) \
                             $(CONSERVATION_AREAS) $(LISTED_BUILDINGS) $(RENTAL) \
                             $(MEDIAN_AGE) $(ELECTION) $(TREE_DENSITY_PC) \
                             $(MERGE_DEPS) pipeline/utils/fuzzy_join.py
@ -445,7 +452,7 @@ $(ACTUAL_LISTINGS_ENRICHED): $(ACTUAL_LISTINGS_RAW) $(EPC) \
 		--ethnicity $(ETHNICITY) \
 		--crime $(CRIME) \
 		--noise $(NOISE) \
-		--school-proximity $(SCHOOL_PROX) \
+		--school-catchments $(SCHOOL_CATCH) \
 		--broadband $(BROADBAND) \
 		--conservation-areas $(CONSERVATION_AREAS) \
 		--listed-buildings $(LISTED_BUILDINGS) \
--- a/analyses/school_catchment_model.ipynb
+++ b/analyses/school_catchment_model.ipynb
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -11,7 +11,7 @@ services:
        command: >
            bash -c "
              cargo install cargo-watch &&
-              cargo watch --poll -i logs/ -x 'run -- --properties /app/property-data4/properties.parquet --postcode-features /app/property-data4/postcode.parquet --pois /app/property-data4/filtered_uk_pois.parquet --places /app/property-data4/places.parquet --tiles /app/property-data4/uk.pmtiles --postcodes /app/property-data4/postcode_boundaries --travel-times /app/property-data4/travel-times --satellite-tiles /app/property-data4/satellite.pmtiles --satellite-highres-tiles /app/property-data4/satellite_highres.pmtiles --noise-overlay-tiles /app/property-data4/noise_lden_10m.pmtiles --crime-hotspot-tiles /app/property-data4/crime_hotspots.pmtiles --tree-overlay-tiles /app/property-data4/trees_outside_woodlands.pmtiles --property-border-tiles /app/property-data4/property_borders.pmtiles'
+              cargo watch --poll -i logs/ -x 'run -- --properties /app/property-data/properties.parquet --postcode-features /app/property-data/postcode.parquet --pois /app/property-data/filtered_uk_pois.parquet --places /app/property-data/places.parquet --tiles /app/property-data/uk.pmtiles --postcodes /app/property-data/postcode_boundaries --travel-times /app/property-data/travel-times --satellite-tiles /app/property-data/satellite.pmtiles --satellite-highres-tiles /app/property-data/satellite_highres.pmtiles --noise-overlay-tiles /app/property-data/noise_lden_10m.pmtiles --crime-hotspot-tiles /app/property-data/crime_hotspots.pmtiles --tree-overlay-tiles /app/property-data/trees_outside_woodlands.pmtiles --property-border-tiles /app/property-data/property_borders.pmtiles --actual-listings-path /app/finder/data/online_listings_buy_enriched.parquet --crime-by-year-path /app/property-data/crime_by_postcode_by_year.parquet'
            "
        ports:
            - "8001:8001"
--- a/frontend/public/assets/twemoji/1f68a.png
+++ b/frontend/public/assets/twemoji/1f68a.png
--- a/frontend/public/assets/twemoji/1fa7a.png
+++ b/frontend/public/assets/twemoji/1fa7a.png
--- a/frontend/public/sitemap.xml
+++ b/frontend/public/sitemap.xml
@ -70,4 +70,14 @@
    <changefreq>monthly</changefreq>
    <priority>0.6</priority>
  </url>
+  <url>
+    <loc>https://perfect-postcode.co.uk/terms</loc>
+    <changefreq>yearly</changefreq>
+    <priority>0.3</priority>
+  </url>
+  <url>
+    <loc>https://perfect-postcode.co.uk/privacy</loc>
+    <changefreq>yearly</changefreq>
+    <priority>0.3</priority>
+  </url>
 </urlset>
--- a/frontend/scripts/prerender.mjs
+++ b/frontend/scripts/prerender.mjs
@ -107,6 +107,20 @@ const ROUTES = [
    description:
      'Learn how Perfect Postcode treats saved searches, account data and property research workflows with privacy and security in mind.',
  },
+  {
+    path: '/terms',
+    output: 'terms/index.html',
+    title: 'Terms of Service | Perfect Postcode',
+    description:
+      'The terms that govern your use of Perfect Postcode, including lifetime access, acceptable use, data accuracy, payments and refunds.',
+  },
+  {
+    path: '/privacy',
+    output: 'privacy/index.html',
+    title: 'Privacy Policy | Perfect Postcode',
+    description:
+      'How Perfect Postcode collects, uses and protects your data: account details, payments, saved searches, AI queries, analytics and your UK GDPR rights.',
+  },
 ];

 const FAQ_SCHEMA_ITEMS = [
@ -325,11 +339,16 @@ async function prerender() {
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

-  try {
-    const baseIndexHtml = cleanBaseIndexHtml(readFileSync(INDEX_PATH, 'utf-8'));
+  // Every real page renders tens of kB; a few hundred chars means the SPA
+  // raced hydration and we captured a loading shell.
+  const MIN_HTML_CHARS = 1000;
+  const MAX_ATTEMPTS = 3;

-    for (const route of ROUTES) {
-      const page = await browser.newPage();
+  async function renderRoute(route) {
+      // A fresh context per attempt: pages otherwise share cache/storage, and a
+      // poisoned chunk-fetch in the shared cache makes a route fail every retry.
+      const context = await browser.createBrowserContext();
+      const page = await context.newPage();

      // Intercept API requests to prevent real fetches and retry loops.
      await page.setRequestInterception(true);
@ -374,6 +393,7 @@ async function prerender() {
        }
      });

+      try {
        await page.goto(`http://127.0.0.1:${port}${route.path}`, {
          waitUntil: 'networkidle0',
          timeout: 30000,
@ -402,8 +422,31 @@ async function prerender() {
        return root.innerHTML;
        });

-      if (!html || html.length < 100) {
-        throw new Error(`Prerender produced too little HTML for ${route.path}`);
+        if (!html || html.length < MIN_HTML_CHARS) {
+          throw new Error(
+            `Prerender produced too little HTML for ${route.path} (${html?.length ?? 0} chars)`
+          );
+        }
+
+        return html;
+      } finally {
+        await context.close().catch(() => {});
+      }
+  }
+
+  try {
+    const baseIndexHtml = cleanBaseIndexHtml(readFileSync(INDEX_PATH, 'utf-8'));
+
+    for (const route of ROUTES) {
+      let html = null;
+      for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt += 1) {
+        try {
+          html = await renderRoute(route);
+          break;
+        } catch (err) {
+          if (attempt === MAX_ATTEMPTS) throw err;
+          console.warn(`Retrying ${route.path} (attempt ${attempt} failed: ${err.message})`);
+        }
      }

      const updated = updateHead(baseIndexHtml, route).replace(
@ -418,7 +461,6 @@ async function prerender() {
      const outputPath = join(DIST_DIR, route.output);
      mkdirSync(dirname(outputPath), { recursive: true });
      writeFileSync(outputPath, updated);
-      await page.close();
      console.log(`Prerendered ${route.path} (${html.length} chars) into ${route.output}`);
    }
  } finally {
--- a/frontend/src/components/legal/LegalPage.tsx
+++ b/frontend/src/components/legal/LegalPage.tsx
@ -0,0 +1,68 @@
+import { useTranslation } from 'react-i18next';
+import { usePageMeta } from '../../hooks/usePageMeta';
+import Footer from '../ui/Footer';
+import { PRIVACY, TERMS, type LegalDoc } from './legal-content';
+
+export type LegalKind = 'terms' | 'privacy';
+
+const DOCS: Record<LegalKind, LegalDoc> = { terms: TERMS, privacy: PRIVACY };
+
+/**
+ * Full-page view for one of the static legal documents.
+ *
+ * Looks up the document for `kind` in DOCS, passes its title and meta
+ * description to usePageMeta, then renders the heading, the last-updated
+ * line, the intro paragraphs, every section (heading, paragraphs, optional
+ * bullet list) and finally the shared Footer.
+ *
+ * Paragraph, heading and bullet strings double as React keys, so each must be
+ * unique within its own list.
+ *
+ * @param kind Which document to show: 'terms' or 'privacy'.
+ */
+export default function LegalPage({ kind }: { kind: LegalKind }) {
+  const { t, i18n } = useTranslation();
+  const doc = DOCS[kind];
+  usePageMeta(`${doc.title} | Perfect Postcode`, doc.metaDescription);
+
+  // Show the "English only" notice for every UI language that does not start
+  // with "en". The optional chain evaluates to undefined when i18n.language is
+  // unset, so the notice is shown in that case as well.
+  const showEnglishNotice = !i18n.language?.toLowerCase().startsWith('en');
+
+  return (
+    <main className="flex-1 overflow-y-auto bg-warm-50 dark:bg-navy-950">
+      <div className="mx-auto max-w-3xl px-4 py-10 sm:py-14">
+        <h1 className="text-3xl font-bold text-navy-950 dark:text-warm-100">{doc.title}</h1>
+        <p className="mt-2 text-sm text-warm-500 dark:text-warm-400">
+          {t('legal.lastUpdated', { date: doc.lastUpdated })}
+        </p>
+        {showEnglishNotice && (
+          <p className="mt-2 text-sm italic text-warm-500 dark:text-warm-400">
+            {t('legal.englishOnly')}
+          </p>
+        )}
+
+        <div className="mt-6 space-y-4">
+          {doc.intro.map((paragraph) => (
+            <p key={paragraph} className="leading-relaxed text-warm-700 dark:text-warm-300">
+              {paragraph}
+            </p>
+          ))}
+        </div>
+
+        <div className="mt-8 space-y-8">
+          {doc.sections.map((section) => (
+            <section key={section.heading}>
+              <h2 className="text-lg font-semibold text-navy-950 dark:text-warm-100">
+                {section.heading}
+              </h2>
+              {section.paragraphs.map((paragraph) => (
+                <p
+                  key={paragraph}
+                  className="mt-2 leading-relaxed text-warm-700 dark:text-warm-300"
+                >
+                  {paragraph}
+                </p>
+              ))}
+              {section.bullets && (
+                <ul className="mt-2 list-disc space-y-1.5 pl-5 text-warm-700 dark:text-warm-300">
+                  {section.bullets.map((bullet) => (
+                    <li key={bullet} className="leading-relaxed">
+                      {bullet}
+                    </li>
+                  ))}
+                </ul>
+              )}
+            </section>
+          ))}
+        </div>
+      </div>
+      <Footer />
+    </main>
+  );
+}
--- a/frontend/src/components/legal/legal-content.ts
+++ b/frontend/src/components/legal/legal-content.ts
@ -0,0 +1,183 @@
+/**
+ * Legal documents are maintained in English only; the English text is the
+ * authoritative version (a localized notice says so on the page). Keeping
+ * legal copy out of the i18n catalogues avoids meaning drift in translation.
+ *
+ * TODO before launch: confirm the operator/legal-entity details below.
+ */
+
+/** One section of a legal document. */
+export interface LegalSection {
+  /** Section title as displayed, including its number (e.g. '1. The service'). */
+  heading: string;
+  /** Body paragraphs in display order; may be empty when the section is bullets only. */
+  paragraphs: string[];
+  /** Optional bullet list; LegalPage renders it after the paragraphs. */
+  bullets?: string[];
+}
+
+/** A complete legal document; TERMS and PRIVACY in this file are the instances. */
+export interface LegalDoc {
+  /** Document name, e.g. 'Terms of Service'. */
+  title: string;
+  /** Short summary; LegalPage passes it to usePageMeta alongside the title. */
+  metaDescription: string;
+  /** Human-readable revision date, e.g. '10 June 2026'. */
+  lastUpdated: string;
+  /** Opening paragraphs shown before the first section. */
+  intro: string[];
+  /** Sections in display order. */
+  sections: LegalSection[];
+}
+
+export const SUPPORT_EMAIL = 'support@perfect-postcode.co.uk';
+
+export const TERMS: LegalDoc = {
+  title: 'Terms of Service',
+  metaDescription:
+    'The terms that govern your use of Perfect Postcode, including lifetime access, acceptable use, data accuracy, payments and refunds.',
+  lastUpdated: '10 June 2026',
+  intro: [
+    `These terms govern your use of perfect-postcode.co.uk ("Perfect Postcode", "the service", "we", "us"). By creating an account or purchasing access you agree to them. If you have any questions, contact ${SUPPORT_EMAIL}.`,
+  ],
+  sections: [
+    {
+      heading: '1. The service',
+      paragraphs: [
+        'Perfect Postcode is a research tool that combines public datasets about England — property transactions, energy certificates, schools, crime, noise, broadband, transport and more — on an interactive map, so you can shortlist areas that fit your needs before booking viewings.',
+        'We are not an estate agent, mortgage broker, surveyor or financial adviser, and the service does not provide financial, legal or investment advice.',
+      ],
+    },
+    {
+      heading: '2. Accounts',
+      paragraphs: [
+        'You need an account to use the service beyond the free demo area. Provide a valid email address, keep your credentials secure, and do not share your account. Accounts are for one person each.',
+        'We may suspend or close accounts that breach these terms, abuse the service, or attempt to circumvent access restrictions. If we close your account without cause, we will refund the price you paid.',
+      ],
+    },
+    {
+      heading: '3. Free demo and lifetime access',
+      paragraphs: [
+        'Free accounts can explore all features within the demo area (inner London). Lifetime access is a one-time payment that gives your account ongoing access to the paid map — every postcode, every filter — for as long as the service runs. It is not a subscription, and routine data updates are included.',
+        'Lifetime access is personal and non-transferable, and is for personal, non-commercial property research. If you would like to use Perfect Postcode commercially (for example in lettings, relocation or research services), contact us first.',
+      ],
+    },
+    {
+      heading: '4. Acceptable use',
+      paragraphs: ['You agree not to:'],
+      bullets: [
+        'scrape, crawl or bulk-download data outside the export tools we provide;',
+        'resell, republish or redistribute the data or substantial extracts of it;',
+        'probe, disrupt or place unreasonable load on the service;',
+        'use the AI search or other features to process content you have no right to submit.',
+      ],
+    },
+    {
+      heading: '5. Data accuracy',
+      paragraphs: [
+        'The maps and figures are built from public datasets (HM Land Registry, EPC register, ONS, Ofsted, DfT, police.uk and others) combined with modelling and estimation. Sources can be incomplete, out of date or wrong at the level of an individual property, and our estimates — including estimated current prices — are statistical indications, not valuations.',
+        'Always verify anything that matters in person and through professional advice (surveys, solicitors, mortgage advisers) before making offers or financial decisions. We provide the service "as is" and do not warrant that any figure is accurate, complete or current.',
+      ],
+    },
+    {
+      heading: '6. Payments and refunds',
+      paragraphs: [
+        'Payments are processed by Stripe; we never see or store your card details. Prices are shown in pounds sterling at checkout. Early-access pricing tiers can change as tiers fill; the price shown at the moment you pay is the price you get.',
+        `If Perfect Postcode is not for you, email ${SUPPORT_EMAIL} within 14 days of purchase and we will refund you in full.`,
+      ],
+    },
+    {
+      heading: '7. Third-party content',
+      paragraphs: [
+        'Street View imagery, listing-portal links and similar embedded content are provided by third parties and governed by their own terms. We are not responsible for their availability or accuracy.',
+      ],
+    },
+    {
+      heading: '8. Liability',
+      paragraphs: [
+        'To the extent permitted by law, we are not liable for decisions made in reliance on the data, for indirect or consequential losses, or for interruptions to the service; our total liability to you is limited to the amount you paid us. Nothing in these terms excludes liability that cannot legally be excluded, and nothing affects your statutory rights as a consumer.',
+      ],
+    },
+    {
+      heading: '9. Changes to the service or these terms',
+      paragraphs: [
+        'We are a small product that improves continuously; features and data sources may change. We may update these terms, and will note the date of the latest revision above. If a change is material we will flag it on the site or by email. Continued use after a change means you accept the updated terms.',
+      ],
+    },
+    {
+      heading: '10. Governing law and contact',
+      paragraphs: [
+        `These terms are governed by the law of England and Wales, and disputes are subject to the jurisdiction of the courts of England and Wales (consumers keep any mandatory protections of their country of residence). Questions and complaints: ${SUPPORT_EMAIL} — we typically respond within 24 hours.`,
+      ],
+    },
+  ],
+};
+
+export const PRIVACY: LegalDoc = {
+  title: 'Privacy Policy',
+  metaDescription:
+    'How Perfect Postcode collects, uses and protects your data: account details, payments, saved searches, AI queries, analytics and your UK GDPR rights.',
+  lastUpdated: '10 June 2026',
+  intro: [
+    `This policy explains what personal data Perfect Postcode ("we", "us") collects, why, and your rights over it. We handle personal data under UK data-protection law (UK GDPR and the Data Protection Act 2018). Contact: ${SUPPORT_EMAIL}.`,
+  ],
+  sections: [
+    {
+      heading: '1. What we collect',
+      paragraphs: [],
+      bullets: [
+        'Account data: your email address, a hashed password (or your Google account identifier if you sign in with Google), newsletter preference and access status.',
+        'Purchase records: what you bought and when. Payments are processed by Stripe; we never receive your card details.',
+        'Things you create: saved searches, shared links and their settings.',
+        'AI search queries: the text you type into the AI search is processed to generate filters and logged with your account so we can debug and improve the feature.',
+        'Usage data: which pages and features are used, collected as events for product analytics, and standard server logs (IP address, user agent) kept for security.',
+      ],
+    },
+    {
+      heading: '2. How we use it',
+      paragraphs: [],
+      bullets: [
+        'To provide and secure the service, including signing you in and remembering your saved work (performance of contract).',
+        'To process payments and keep the records tax law requires (legal obligation).',
+        'To answer support requests (performance of contract).',
+        'To send the newsletter, only if you opted in — every email includes an unsubscribe link (consent).',
+        'To understand how features are used and improve them, using aggregated analytics and logged AI queries (legitimate interests).',
+      ],
+    },
+    {
+      heading: '3. Who we share it with',
+      paragraphs: [
+        'We do not sell personal data. We use a small number of processors to run the service:',
+      ],
+      bullets: [
+        'Stripe — payment processing.',
+        'Google — sign-in (if you choose Google OAuth), embedded Maps/Street View imagery, and the Gemini API which processes the text of AI searches.',
+        'Hosting and infrastructure providers that run our servers and store backups.',
+      ],
+    },
+    {
+      heading: '4. International transfers',
+      paragraphs: [
+        'Some processors (such as Stripe and Google) process data outside the UK. Where that happens, transfers rely on UK adequacy decisions or standard contractual clauses.',
+      ],
+    },
+    {
+      heading: '5. Cookies and local storage',
+      paragraphs: [
+        'We do not use advertising cookies or third-party trackers. Your browser’s local storage holds your sign-in token and preferences (theme, language, tutorial progress, last map view). Embedded Google content (Street View, sign-in) may set its own cookies under Google’s policies.',
+      ],
+    },
+    {
+      heading: '6. Retention',
+      paragraphs: [
+        'Account data is kept while your account exists and deleted when you ask us to close it. Server logs are kept for a short period for security. Purchase records are kept for as long as tax law requires (typically six years).',
+      ],
+    },
+    {
+      heading: '7. Your rights',
+      paragraphs: [
+        `You can ask for a copy of your data, have it corrected or deleted, restrict or object to processing, and receive your data in a portable format. Email ${SUPPORT_EMAIL} and we will respond promptly. If you are unhappy with how we handle your data you can complain to the Information Commissioner’s Office (ico.org.uk).`,
+      ],
+    },
+    {
+      heading: '8. Children',
+      paragraphs: ['The service is aimed at home buyers and renters and is not directed at children under 16.'],
+    },
+    {
+      heading: '9. Changes to this policy',
+      paragraphs: [
+        'We will post any changes here and update the date at the top. Material changes will be flagged on the site or by email.',
+      ],
+    },
+  ],
+};
--- a/frontend/src/components/ui/Footer.tsx
+++ b/frontend/src/components/ui/Footer.tsx
@ -0,0 +1,79 @@
+import { useTranslation } from 'react-i18next';
+import { LogoIcon } from './icons/LogoIcon';
+
+const SUPPORT_EMAIL = 'support@perfect-postcode.co.uk';
+
+/** One footer list entry: an `<li>` wrapping a plain anchor to `href` that shows `label`. */
+function FooterLink({ href, label }: { href: string; label: string }) {
+  const linkClassName =
+    'text-sm text-warm-500 hover:text-teal-600 dark:text-warm-400 dark:hover:text-teal-400 transition-colors';
+  const anchor = (
+    <a href={href} className={linkClassName}>
+      {label}
+    </a>
+  );
+  return <li>{anchor}</li>;
+}
+
+/**
+ * Site-wide footer: a brand link with tagline, three link columns (product,
+ * resources, legal) and a bottom bar with the copyright and coverage lines.
+ * Every visible string is looked up through `t`; the contact entry is a
+ * mailto link built from SUPPORT_EMAIL.
+ */
+export default function Footer() {
+  const { t } = useTranslation();
+  // Evaluated on every render and interpolated into the footer.copyright string.
+  const year = new Date().getFullYear();
+
+  return (
+    <footer className="border-t border-warm-200 bg-warm-50 dark:border-warm-800 dark:bg-navy-950">
+      <div className="mx-auto max-w-6xl px-4 py-10">
+        <div className="grid gap-8 sm:grid-cols-2 md:grid-cols-4">
+          <div>
+            <a href="/" className="flex items-center gap-2 hover:opacity-80 transition-opacity">
+              <LogoIcon className="h-5 w-5 shrink-0 text-teal-500" />
+              <span className="text-base font-semibold text-navy-950 dark:text-teal-300">
+                {t('header.appName')}
+              </span>
+            </a>
+            <p className="mt-3 text-sm leading-relaxed text-warm-500 dark:text-warm-400">
+              {t('footer.tagline')}
+            </p>
+          </div>
+
+          <nav aria-label={t('footer.product')}>
+            <h2 className="text-xs font-semibold uppercase tracking-wide text-warm-400 dark:text-warm-500">
+              {t('footer.product')}
+            </h2>
+            <ul className="mt-3 space-y-2">
+              <FooterLink href="/dashboard" label={t('header.dashboard')} />
+              <FooterLink href="/pricing" label={t('header.pricing')} />
+              <FooterLink href="/learn" label={t('header.learn')} />
+            </ul>
+          </nav>
+
+          <nav aria-label={t('footer.resources')}>
+            <h2 className="text-xs font-semibold uppercase tracking-wide text-warm-400 dark:text-warm-500">
+              {t('footer.resources')}
+            </h2>
+            <ul className="mt-3 space-y-2">
+              <FooterLink href="/data-sources" label={t('footer.dataSources')} />
+              <FooterLink href="/methodology" label={t('footer.methodology')} />
+              <FooterLink href={`mailto:${SUPPORT_EMAIL}`} label={t('footer.contact')} />
+            </ul>
+          </nav>
+
+          <nav aria-label={t('footer.legal')}>
+            <h2 className="text-xs font-semibold uppercase tracking-wide text-warm-400 dark:text-warm-500">
+              {t('footer.legal')}
+            </h2>
+            <ul className="mt-3 space-y-2">
+              <FooterLink href="/terms" label={t('footer.terms')} />
+              <FooterLink href="/privacy" label={t('footer.privacy')} />
+            </ul>
+          </nav>
+        </div>
+
+        <div className="mt-10 flex flex-col gap-2 border-t border-warm-200 pt-6 text-xs text-warm-400 dark:border-warm-800 dark:text-warm-500 sm:flex-row sm:items-center sm:justify-between">
+          <p>{t('footer.copyright', { year })}</p>
+          <p>{t('footer.coverage')}</p>
+        </div>
+      </div>
+    </footer>
+  );
+}
--- a/frontend/src/hooks/useIsDarkTheme.ts
+++ b/frontend/src/hooks/useIsDarkTheme.ts
@ -0,0 +1,18 @@
+import { useEffect, useState } from 'react';
+
+/** Reads the current html.dark state; false when no DOM exists (SSR, non-DOM tests). */
+function readIsDarkClass(): boolean {
+  return typeof document !== 'undefined' && document.documentElement.classList.contains('dark');
+}
+
+/**
+ * Tracks whether dark mode is active by observing the html.dark class.
+ * Useful in components that don't receive the theme as a prop (showcase,
+ * pricing backdrop) but must keep canvas/map content in sync with it.
+ *
+ * @returns true while <html> carries the `dark` class; false otherwise, and
+ *   false during renders where `document` is unavailable.
+ */
+export function useIsDarkTheme(): boolean {
+  const [isDark, setIsDark] = useState(readIsDarkClass);
+  useEffect(() => {
+    const sync = () => setIsDark(readIsDarkClass());
+    // The class may have flipped between the initial render and this effect
+    // (the observer was not attached yet), so catch up before subscribing.
+    sync();
+    const observer = new MutationObserver(sync);
+    observer.observe(document.documentElement, { attributes: true, attributeFilter: ['class'] });
+    return () => observer.disconnect();
+  }, []);
+  return isDark;
+}
--- a/frontend/src/lib/fit-bounds.test.ts
+++ b/frontend/src/lib/fit-bounds.test.ts
@ -0,0 +1,35 @@
+import { describe, expect, it } from 'vitest';
+
+import { MAP_MIN_ZOOM } from './consts';
+import { boundsToCenterZoom } from './fit-bounds';
+
+// Behavioural checks for boundsToCenterZoom: centring, relative zoom ordering,
+// the zoom-in cap, and tolerance of swapped corners.
+describe('boundsToCenterZoom', () => {
+  it('centers on the middle of the box', () => {
+    const { lat, lng } = boundsToCenterZoom({ south: 51.4, north: 51.6, west: -0.3, east: 0.1 });
+    expect(lat).toBeCloseTo(51.5, 5);
+    expect(lng).toBeCloseTo(-0.1, 5);
+  });
+
+  it('zooms close for a small box and far out for a country-sized box', () => {
+    const streetZoom = boundsToCenterZoom({
+      south: 51.5,
+      north: 51.51,
+      west: -0.11,
+      east: -0.1,
+    }).zoom;
+    const englandZoom = boundsToCenterZoom({
+      south: 50.0,
+      north: 55.5,
+      west: -5.7,
+      east: 1.8,
+    }).zoom;
+    expect(streetZoom).toBeGreaterThan(englandZoom);
+    expect(englandZoom).toBeGreaterThanOrEqual(MAP_MIN_ZOOM);
+    // Greater London-ish box should land in a sensible city-scale zoom range
+    const londonZoom = boundsToCenterZoom({
+      south: 51.44,
+      north: 51.59,
+      west: -0.31,
+      east: 0.05,
+    }).zoom;
+    expect(londonZoom).toBeGreaterThan(8);
+    expect(londonZoom).toBeLessThan(12);
+  });
+
+  it('caps zoom-in for degenerate (single point) boxes', () => {
+    const { zoom } = boundsToCenterZoom({ south: 51.5, north: 51.5, west: -0.1, east: -0.1 });
+    expect(zoom).toBeLessThanOrEqual(13);
+  });
+
+  it('tolerates swapped corners', () => {
+    const { lat, lng, zoom } = boundsToCenterZoom({
+      south: 51.6,
+      north: 51.4,
+      west: 0.1,
+      east: -0.3,
+    });
+    expect(lat).toBeCloseTo(51.5, 5);
+    expect(lng).toBeCloseTo(-0.1, 5);
+    expect(Number.isFinite(zoom)).toBe(true);
+  });
+});
--- a/frontend/src/lib/fit-bounds.ts
+++ b/frontend/src/lib/fit-bounds.ts
@ -0,0 +1,45 @@
+import { MAP_MIN_ZOOM } from './consts';
+
+/**
+ * Geographic bounding box, all edges in degrees.
+ * boundsToCenterZoom accepts the south/north and west/east pairs in either
+ * order and normalizes them with min/max before use.
+ */
+export interface GeoBounds {
+  /** Latitude of one horizontal edge (nominally the southern one). */
+  south: number;
+  /** Longitude of one vertical edge (nominally the western one). */
+  west: number;
+  /** Latitude of the opposite horizontal edge (nominally the northern one). */
+  north: number;
+  /** Longitude of the opposite vertical edge (nominally the eastern one). */
+  east: number;
+}
+
+/**
+ * Nominal viewport used to derive a zoom from a bounding box. The map only
+ * exposes flyTo(lat, lng, zoom), so we approximate fitBounds; being half a
+ * zoom level off for unusual window sizes is fine for "show me the matches".
+ */
+const NOMINAL_VIEWPORT = { width: 1000, height: 700 };
+/** Tile edge length, in the same pixel units as NOMINAL_VIEWPORT, used in the zoom formula. */
+const TILE_SIZE = 512;
+/** Keep matches comfortably inside the viewport edges. */
+const ZOOM_PADDING = 0.4;
+/** Upper clamp on the derived zoom, so tiny or single-point boxes don't zoom in further. */
+const MAX_FIT_ZOOM = 13;
+
+/** Mercator-projected y (unitless, radians-based) for a latitude given in degrees. */
+function mercatorY(lat: number): number {
+  const latRadians = (lat * Math.PI) / 180;
+  const tangent = Math.tan(Math.PI / 4 + latRadians / 2);
+  return Math.log(tangent);
+}
+
+/**
+ * Convert a bounding box into a flyTo target that roughly fits it on screen.
+ *
+ * Corners may be supplied swapped (south > north, west > east); they are
+ * normalized first. Latitudes are clamped to the Web Mercator limit so an
+ * out-of-range value cannot turn the projection into NaN and leak a NaN zoom
+ * to the caller. Zero-size boxes are given a tiny minimum span, so they end
+ * up at MAX_FIT_ZOOM instead of dividing by zero.
+ *
+ * @param bounds Box edges in degrees.
+ * @returns Centre of the (clamped) box and a zoom within
+ *   [MAP_MIN_ZOOM, MAX_FIT_ZOOM].
+ */
+export function boundsToCenterZoom(bounds: GeoBounds): { lat: number; lng: number; zoom: number } {
+  // Beyond roughly ±85.05113° the Mercator y is undefined (tan goes negative
+  // and log returns NaN), so clamp before projecting.
+  const maxMercatorLat = 85.05112878;
+  const clampLat = (lat: number): number => Math.max(-maxMercatorLat, Math.min(maxMercatorLat, lat));
+
+  const south = clampLat(Math.min(bounds.south, bounds.north));
+  const north = clampLat(Math.max(bounds.south, bounds.north));
+  const west = Math.min(bounds.west, bounds.east);
+  const east = Math.max(bounds.west, bounds.east);
+
+  // Floor the spans so a single-point box yields a finite (capped) zoom.
+  const lonSpan = Math.max(east - west, 1e-6);
+  const mercSpan = Math.max(mercatorY(north) - mercatorY(south), 1e-6);
+
+  // Zoom at which the span fills the nominal viewport along each axis; the
+  // tighter axis wins so the whole box stays visible.
+  const zoomX = Math.log2((NOMINAL_VIEWPORT.width * 360) / (TILE_SIZE * lonSpan));
+  const zoomY = Math.log2((NOMINAL_VIEWPORT.height * 2 * Math.PI) / (TILE_SIZE * mercSpan));
+  const zoom = Math.max(MAP_MIN_ZOOM, Math.min(MAX_FIT_ZOOM, Math.min(zoomX, zoomY) - ZOOM_PADDING));
+
+  return {
+    lat: (south + north) / 2,
+    lng: (west + east) / 2,
+    zoom,
+  };
+}
--- a/pipeline/check_school_cutoffs.py
+++ b/pipeline/check_school_cutoffs.py
@ -0,0 +1,297 @@
+"""Evaluate modelled school catchment radii against published cutoffs.
+
+Local authorities publish each school's "last distance offered" in their
+yearly allocation reports; ``property-data/ground_truth/cutoffs_*.json``
+holds a scraped sample of those figures (see the collection notes in each
+file's ``source_url`` fields). This script matches them to the per-school
+radii emitted by ``pipeline.transform.school_catchments --schools-output``
+and reports how well the model reproduces reality, so the preference-bonus
+constants can be calibrated.
+
+Headline metrics use non-faith schools whose published cutoff was a binding
+distance. Faith schools are reported separately (their distance criterion
+applies within faith priority, so published figures aren't comparable), as
+are "all applicants offered" schools, where the model should ideally show no
+binding cutoff.
+"""
+
+import argparse
+import difflib
+import json
+import re
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+
+# Filler tokens removed from names before comparison: stop-words ("the", "of",
+# "and") plus qualifiers such as "c of e", "rc", "voluntary aided". Applied by
+# normalize_name to text that is already lower-cased and punctuation-free.
+_NOISE_WORDS = re.compile(
+    r"\b(the|of|and|c\s*of\s*e|cofe|ce|rc|voluntary|aided|controlled|va|vc)\b"
+)
+# Any character other than a lower-case ASCII letter, digit or space.
+_NON_ALNUM = re.compile(r"[^a-z0-9 ]")
+# School-type words; removed only when normalize_name is called with
+# strip_school_words=True.
+_SCHOOL_WORDS = re.compile(
+    r"\b(school|academy|primary|secondary|junior|infant|community|college|high)\b"
+)
+
+
+def normalize_name(name: str, strip_school_words: bool = False) -> str:
+    """Reduce a school name to a comparable lower-case token string.
+
+    Lower-cases, expands "&" to "and", removes apostrophes, replaces other
+    punctuation with spaces, drops the _NOISE_WORDS tokens and collapses
+    whitespace.
+
+    Args:
+        name: School name as published.
+        strip_school_words: Also drop school-type words (_SCHOOL_WORDS), for
+            matching informal names such as "Ashmole Primary".
+
+    Returns:
+        The normalized name; may be empty if every token was removed.
+    """
+    s = name.lower().replace("&", " and ").replace("st.", "st ")
+    # Remove every apostrophe variant, not just the ASCII one: scraped reports
+    # often use typographic quotes, and "mary’s" must normalize to "marys"
+    # exactly like "mary's" instead of splitting into "mary s".
+    s = re.sub(r"['’‘`]", "", s)
+    s = _NON_ALNUM.sub(" ", s)
+    s = _NOISE_WORDS.sub(" ", s)
+    if strip_school_words:
+        s = _SCHOOL_WORDS.sub(" ", s)
+    return " ".join(s.split())
+
+
+def normalize_la(la: str) -> str:
+    """Normalize a local-authority name: lower-case, "&" to "and", punctuation
+    to spaces, the phrase "city of" removed, whitespace collapsed."""
+    lowered = la.lower().replace("&", " and ")
+    cleaned = _NON_ALNUM.sub(" ", lowered)
+    without_city_of = cleaned.replace("city of", "")
+    return " ".join(without_city_of.split())
+
+
+def load_ground_truth(directory: Path) -> pl.DataFrame:
+    """Load every ``cutoffs_*.json`` file under ``directory`` into one frame.
+
+    Each file must hold a JSON array of objects with at least ``school_name``,
+    ``la`` and ``phase``; the remaining fields are optional and defaulted.
+
+    Args:
+        directory: Folder containing the scraped ground-truth JSON files.
+
+    Returns:
+        One row per published cutoff, with columns school_name, la, phase,
+        entry_year, cutoff_km, all_offered, faith_school, school_postcode and
+        source_url.
+
+    Raises:
+        SystemExit: If no file contributes any rows.
+    """
+    rows = []
+    for path in sorted(directory.glob("cutoffs_*.json")):
+        # JSON is UTF-8 by specification; don't depend on the platform's
+        # default encoding (school names contain non-ASCII characters).
+        for row in json.loads(path.read_text(encoding="utf-8")):
+            rows.append(
+                {
+                    "school_name": row["school_name"],
+                    "la": row["la"],
+                    "phase": row["phase"],
+                    "entry_year": int(row.get("entry_year") or 0),
+                    "cutoff_km": (
+                        float(row["cutoff_km"]) if row.get("cutoff_km") is not None else None
+                    ),
+                    "all_offered": bool(row.get("all_offered", False)),
+                    "faith_school": bool(row.get("faith_school", False)),
+                    "school_postcode": row.get("school_postcode"),
+                    "source_url": row.get("source_url", ""),
+                }
+            )
+    if not rows:
+        raise SystemExit(f"No cutoffs_*.json files with rows under {directory}")
+    # Pin the dtypes of the nullable columns: schema inference only samples
+    # leading rows, and those can all be null ("all offered" schools carry no
+    # cutoff; many rows have no postcode).
+    df = pl.DataFrame(
+        rows,
+        schema_overrides={"school_postcode": pl.Utf8, "cutoff_km": pl.Float64},
+    )
+    print(f"Ground truth rows: {len(df)} from {directory}")
+    return df
+
+
+def match_schools(truth: pl.DataFrame, gias: pl.DataFrame) -> pl.DataFrame:
+    """Attach GIAS URNs to ground-truth rows by postcode, then name.
+
+    Matching runs in four stages, each seeing only the rows the previous
+    stages left unmatched: unique postcode, exact normalized (name, LA),
+    name with school-type words stripped, and finally a fuzzy name match
+    within the LA.
+
+    Args:
+        truth: Ground-truth rows with school_name, la, phase and
+            school_postcode columns.
+        gias: GIAS extract with urn, name, postcode and local_authority.
+
+    Returns:
+        The matched ground-truth rows (plus the internal ``_row_id``,
+        ``_name_norm``, ``_name_stripped``, ``_la_norm`` and ``_pc`` helper
+        columns) with a ``urn`` column; empty, with the same schema, when
+        nothing matched.
+    """
+    def stripped(name: str) -> str:
+        return normalize_name(name, strip_school_words=True)
+
+    gias = gias.with_columns(
+        pl.col("name")
+        .map_elements(normalize_name, return_dtype=pl.Utf8)
+        .alias("_name_norm"),
+        pl.col("name")
+        .map_elements(stripped, return_dtype=pl.Utf8)
+        .alias("_name_stripped"),
+        pl.col("local_authority")
+        .map_elements(normalize_la, return_dtype=pl.Utf8)
+        .alias("_la_norm"),
+        pl.col("postcode").str.replace_all(" ", "").str.to_uppercase().alias("_pc"),
+    )
+    truth = truth.with_columns(
+        pl.col("school_name")
+        .map_elements(normalize_name, return_dtype=pl.Utf8)
+        .alias("_name_norm"),
+        pl.col("school_name")
+        .map_elements(stripped, return_dtype=pl.Utf8)
+        .alias("_name_stripped"),
+        pl.col("la")
+        .map_elements(normalize_la, return_dtype=pl.Utf8)
+        .alias("_la_norm"),
+        pl.col("school_postcode")
+        .str.replace_all(" ", "")
+        .str.to_uppercase()
+        .alias("_pc"),
+    ).with_row_index("_row_id")
+
+    # 1. Exact postcode match (unique postcodes only — site-sharing schools
+    #    would mismatch phases otherwise; those fall through to name matching).
+    pc_unique = gias.filter(pl.col("_pc").is_not_null()).unique(
+        subset="_pc", keep="none"
+    )
+    by_pc = truth.filter(pl.col("_pc").is_not_null()).join(
+        pc_unique.select("_pc", "urn"), on="_pc", how="inner"
+    )
+    matched_ids = set(by_pc["_row_id"].to_list())
+
+    # 2. Exact normalized (name, LA) match against GIAS (name, LA) pairs that
+    #    are unique on the GIAS side.
+    gias_named = gias.unique(subset=["_name_norm", "_la_norm"], keep="none")
+    remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))
+    by_name = remaining.join(
+        gias_named.select("_name_norm", "_la_norm", "urn"),
+        on=["_name_norm", "_la_norm"],
+        how="inner",
+    )
+    matched_ids |= set(by_name["_row_id"].to_list())
+
+    # 3. Reports often print informal names ("Ashmole Primary" for "Ashmole
+    #    Primary School"): match on names with school-type words stripped,
+    #    unique on both sides so site-sharing infant/junior pairs fall through.
+    gias_stripped = gias.filter(pl.col("_name_stripped") != "").unique(
+        subset=["_name_stripped", "_la_norm"], keep="none"
+    )
+    remaining = truth.filter(
+        (~pl.col("_row_id").is_in(list(matched_ids))) & (pl.col("_name_stripped") != "")
+    ).unique(subset=["_name_stripped", "_la_norm", "phase"], keep="none")
+    by_stripped = remaining.join(
+        gias_stripped.select("_name_stripped", "_la_norm", "urn"),
+        on=["_name_stripped", "_la_norm"],
+        how="inner",
+    )
+    matched_ids |= set(by_stripped["_row_id"].to_list())
+
+    # 4. Fuzzy name match within the LA: unique best candidate >= 0.87.
+    remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))
+    fuzzy_row_ids = []
+    fuzzy_urns = []
+    # Per-LA (urns, normalized names) candidate lists, built on first use.
+    candidates_by_la: dict = {}
+    for row in remaining.iter_rows(named=True):
+        name_norm = row["_name_norm"]
+        if name_norm is None:
+            continue
+        la = row["_la_norm"]
+        if la not in candidates_by_la:
+            # Null names would crash SequenceMatcher; they can never match.
+            in_la = gias.filter(
+                (pl.col("_la_norm") == la) & pl.col("_name_norm").is_not_null()
+            )
+            candidates_by_la[la] = (
+                in_la["urn"].to_list(),
+                in_la["_name_norm"].to_list(),
+            )
+        cand_urns, cand_names = candidates_by_la[la]
+        if not cand_names:
+            continue
+        scores = [
+            difflib.SequenceMatcher(None, name_norm, cand).ratio()
+            for cand in cand_names
+        ]
+        order = np.argsort(scores)[::-1]
+        # Accept only a clear winner: high score and a margin over the runner-up.
+        if scores[order[0]] >= 0.87 and (
+            len(order) == 1 or scores[order[1]] < scores[order[0]] - 0.04
+        ):
+            fuzzy_row_ids.append(row["_row_id"])
+            fuzzy_urns.append(cand_urns[int(order[0])])
+
+    # Join the (row id, urn) pairs back onto ``truth`` instead of rebuilding a
+    # frame from Python dicts: re-inferred dtypes (e.g. an all-null postcode
+    # column becoming Null) would not concatenate with the other stages.
+    fuzzy_pairs = pl.DataFrame(
+        {"_row_id": fuzzy_row_ids, "urn": fuzzy_urns},
+        schema={"_row_id": truth.schema["_row_id"], "urn": gias.schema["urn"]},
+    )
+    by_fuzzy = truth.join(fuzzy_pairs, on="_row_id", how="inner")
+
+    # Every stage frame shares one schema, so empty stages concatenate fine
+    # and an all-empty result is simply an empty frame.
+    columns = truth.columns + ["urn"]
+    matched = pl.concat(
+        [p.select(columns) for p in (by_pc, by_name, by_stripped, by_fuzzy)]
+    ).unique(subset="_row_id", keep="first")
+    print(
+        f"Matched {len(matched)}/{len(truth)} ground-truth rows to GIAS URNs "
+        f"(postcode {len(by_pc)}, exact name {len(by_name)}, "
+        f"stripped {len(by_stripped)}, fuzzy {len(by_fuzzy)})"
+    )
+    return matched
+
+
+def evaluate(matched: pl.DataFrame, radii: pl.DataFrame) -> pl.DataFrame:
+    """Print accuracy metrics for modelled radii against published cutoffs.
+
+    Joins ``matched`` to ``radii`` on (urn, phase), reports per-phase metrics
+    for non-faith schools with a binding cutoff, a separate informational
+    report for faith schools, and a summary for "all offered" schools.
+
+    Returns:
+        The joined rows whose published cutoff was binding and plausible.
+    """
+    joined = matched.join(radii, on=["urn", "phase"], how="inner")
+    print(f"Joined to modelled radii: {len(joined)} rows")
+
+    # Published figures occasionally include non-typical admits (a child who
+    # moved mid-process can print as hundreds of km); cap at distances a
+    # distance criterion can plausibly produce.
+    is_binding = ~pl.col("all_offered") & pl.col("cutoff_km").is_between(0.05, 20.0)
+    binding = joined.filter(is_binding)
+
+    def report(df: pl.DataFrame, label: str) -> None:
+        """Print bias, error, within-factor-2 share and rank correlation."""
+        if df.is_empty():
+            print(f"\n{label}: no rows")
+            return
+        truth_km = df["cutoff_km"].to_numpy()
+        model_km = df["radius_km"].to_numpy()
+        # Errors are measured in log2 space, so 1.0 means "off by a factor 2".
+        log_ratio = np.log2(model_km / truth_km)
+        abs_log_ratio = np.abs(log_ratio)
+        median_bias = np.median(log_ratio)
+        median_abs_error = np.median(abs_log_ratio)
+        within2 = float(np.mean(abs_log_ratio <= 1))
+        pairs = pl.DataFrame({"t": truth_km, "m": model_km})
+        rank = pairs.select(pl.corr("t", "m", method="spearman")).item()
+        print(
+            f"\n{label} (n={len(df)}):\n"
+            f"  median bias (log2 model/truth): {median_bias:+.2f} "
+            f"(x{2 ** median_bias:.2f})\n"
+            f"  median |log2 error|: {median_abs_error:.2f} "
+            f"(x{2 ** median_abs_error:.2f})\n"
+            f"  within factor 2: {within2:.0%}\n"
+            f"  Spearman rank corr: {rank:.2f}"
+        )
+
+    non_faith = ~pl.col("faith_school")
+    for phase in ("primary", "secondary"):
+        phase_rows = binding.filter((pl.col("phase") == phase) & non_faith)
+        report(phase_rows, f"BINDING, non-faith, {phase}")
+    report(binding.filter(pl.col("faith_school")), "BINDING, faith (informational)")
+
+    offered = joined.filter(pl.col("all_offered"))
+    if offered.is_empty():
+        return binding
+    unbound_share = float((~offered["filled"]).mean())
+    print(
+        f"\nALL-OFFERED schools (n={len(offered)}): model agrees no binding "
+        f"cutoff for {unbound_share:.0%}; median modelled radius "
+        f"{offered['radius_km'].median():.2f} km"
+    )
+    return binding
+
+
+def main() -> None:
+    """CLI entry point: load inputs, match schools, print the evaluation.
+
+    Optionally writes the matched rows joined to the modelled radii as CSV
+    (``--matched-out``).
+
+    Raises:
+        SystemExit: If no ground truth is found, or no binding cutoff could
+            be matched to a modelled radius.
+    """
+    parser = argparse.ArgumentParser(
+        description="Compare modelled catchment radii with published cutoffs"
+    )
+    parser.add_argument(
+        "--ground-truth-dir",
+        type=Path,
+        default=Path("property-data/ground_truth"),
+    )
+    parser.add_argument(
+        "--radii",
+        type=Path,
+        default=Path("property-data/school_catchment_radii.parquet"),
+        help="Per-school radii parquet from school_catchments --schools-output",
+    )
+    parser.add_argument("--gias", type=Path, default=Path("property-data/gias.parquet"))
+    parser.add_argument(
+        "--matched-out",
+        type=Path,
+        default=None,
+        help="Optional CSV of matched rows for inspection",
+    )
+    args = parser.parse_args()
+
+    truth = load_ground_truth(args.ground_truth_dir)
+    # One row per school+phase: keep the most recent entry year.
+    truth = (
+        truth.sort("entry_year", descending=True)
+        .unique(subset=["school_name", "la", "phase"], keep="first")
+    )
+    gias = pl.read_parquet(args.gias).select(
+        "urn", "name", "postcode", "local_authority", "religious_character"
+    )
+    radii = pl.read_parquet(args.radii).unique(subset=["urn", "phase"], keep="first")
+
+    matched = match_schools(truth, gias.drop("religious_character"))
+    # GIAS religious character is authoritative; the scraped name-based flag
+    # only covers rows that failed to match.
+    matched = matched.join(
+        gias.select("urn", "religious_character"), on="urn", how="left"
+    ).with_columns(
+        pl.when(pl.col("religious_character").is_not_null())
+        .then(~pl.col("religious_character").is_in(["None", "Does not apply"]))
+        .otherwise(pl.col("faith_school"))
+        .alias("faith_school")
+    )
+    binding = evaluate(matched, radii)
+
+    if args.matched_out is not None:
+        # Drop every matching-only helper column, "_name_stripped" included,
+        # so the inspection CSV holds just the data columns.
+        out = matched.join(radii, on=["urn", "phase"], how="inner").drop(
+            "_row_id", "_name_norm", "_name_stripped", "_la_norm", "_pc"
+        )
+        args.matched_out.parent.mkdir(parents=True, exist_ok=True)
+        out.write_csv(args.matched_out)
+        print(f"\nWrote matched rows to {args.matched_out}")
+
+    if binding.is_empty():
+        raise SystemExit("No binding, matchable cutoffs — nothing to calibrate on")
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/lsoa_children.py
+++ b/pipeline/download/lsoa_children.py
@ -0,0 +1,93 @@
+"""Download Census 2021 children by five-year age band per LSOA.
+
+Source: NOMIS (ONS Census 2021 — TS007A dataset, age by five-year bands)
+License: Open Government Licence v3.0
+
+Used to estimate how many primary-age (4-10) and secondary-age (11-15)
+children live in each LSOA, which drives the school catchment model. Census
+bands don't align with school phases, so phase totals take fractional shares
+of the 0-4, 10-14 and 15-19 bands (one fifth per single year of age).
+"""
+
+import argparse
+from io import BytesIO
+from pathlib import Path
+
+import httpx
+import polars as pl
+
+# NOMIS API: Census 2021 TS007A (age, five-year bands) by LSOA 2021 (TYPE151).
+# c2021_age_19 codes: 1 = 0-4, 2 = 5-9, 3 = 10-14, 4 = 15-19.
+# NOMIS paginates at 25,000 rows by default, so we paginate with recordoffset.
+BASE_URL = (
+    "https://www.nomisweb.co.uk/api/v01/dataset/NM_2020_1.data.csv"
+    "?date=latest&geography=TYPE151&measures=20100&c2021_age_19=1,2,3,4"
+    "&select=GEOGRAPHY_CODE,C2021_AGE_19,OBS_VALUE"
+)
+# Step added to recordoffset per request; a page shorter than this ends the loop.
+PAGE_SIZE = 25000
+
+# c2021_age_19 code -> output column name; the pivoted code columns are
+# renamed through this mapping.
+AGE_BAND_COLUMNS = {
+    1: "aged_0_4",
+    2: "aged_5_9",
+    3: "aged_10_14",
+    4: "aged_15_19",
+}
+
+
+def download_and_convert(output_path: Path) -> None:
+    """Download the LSOA age-band counts from NOMIS and write them as parquet.
+
+    Pages through the CSV API with ``recordoffset``, pivots the long
+    (LSOA, age code, value) rows into one row per LSOA with a column per age
+    band, keeps LSOA codes starting with "E", and writes zstd parquet.
+
+    Args:
+        output_path: Destination parquet file; parent folders are created.
+
+    Raises:
+        httpx.HTTPStatusError: If any page request fails.
+        ValueError: If NOMIS returns no rows, or an expected age band is
+            absent from the response.
+    """
+    print("Downloading Census 2021 LSOA age bands from NOMIS...")
+    frames = []
+    offset = 0
+    while True:
+        url = f"{BASE_URL}&recordoffset={offset}"
+        response = httpx.get(url, follow_redirects=True, timeout=120)
+        response.raise_for_status()
+        if len(response.content) == 0:
+            break
+        chunk = pl.read_csv(BytesIO(response.content))
+        if chunk.height == 0:
+            break
+        frames.append(chunk)
+        print(f"  Fetched {chunk.height} rows (offset={offset})")
+        # A short page is the last page.
+        if chunk.height < PAGE_SIZE:
+            break
+        offset += PAGE_SIZE
+
+    # Fail with a clear message instead of an opaque error from concat([]).
+    if not frames:
+        raise ValueError("NOMIS returned no rows for the LSOA age-band query")
+
+    df = pl.concat(frames)
+    print(f"Total rows: {df.height}")
+
+    pivoted = df.rename({"GEOGRAPHY_CODE": "lsoa21"}).pivot(
+        on="C2021_AGE_19", index="lsoa21", values="OBS_VALUE"
+    )
+
+    # Validate BEFORE renaming/casting: those steps would otherwise fail first
+    # on a missing code column, and this explicit error would never be reached.
+    missing = [
+        name for code, name in AGE_BAND_COLUMNS.items() if str(code) not in pivoted.columns
+    ]
+    if missing:
+        raise ValueError(f"NOMIS response missing age bands: {missing}")
+
+    result = (
+        pivoted.rename({str(code): name for code, name in AGE_BAND_COLUMNS.items()})
+        .with_columns(pl.col(name).cast(pl.UInt32) for name in AGE_BAND_COLUMNS.values())
+        .filter(pl.col("lsoa21").str.starts_with("E"))
+        .sort("lsoa21")
+    )
+
+    print(f"England LSOAs: {result.height}")
+    for name in AGE_BAND_COLUMNS.values():
+        print(f"  {name}: total {result[name].sum():,}")
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    result.write_parquet(output_path, compression="zstd")
+    print(f"Saved to {output_path}")
+
+
+def main() -> None:
+    """CLI entry point: parse ``--output`` and run the download."""
+    cli = argparse.ArgumentParser(
+        description="Download Census 2021 age bands (children) by LSOA"
+    )
+    cli.add_argument("--output", type=Path, required=True, help="Output parquet file path")
+    options = cli.parse_args()
+    download_and_convert(options.output)
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/download/naptan.py
+++ b/pipeline/download/naptan.py
@ -12,8 +12,18 @@ import polars as pl

 NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
 TUBE_STATION_CATEGORY = "Tube station"
+TRAM_METRO_CATEGORY = "Tram & Metro stop"
 TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01

+# London Underground ATCO codes are "<area><kind>ZZLU<station>": a 3-digit
+# AdministrativeAreaCode (940 national, 490 London, plus 150/210/040/... for
+# LU stations outside Greater London such as Epping or Amersham), then "0"
+# (platform/entrance node) or "G" (station group node), then the system code.
+# "ZZLU" is unique to London Underground, which cleanly separates genuine Tube
+# stations from every other TMU/MET network (Metrolink, Supertram, T&W Metro,
+# WM Metro, Blackpool Tramway, heritage railways, ...).
+LONDON_UNDERGROUND_ATCO_PATTERN = r"(?i)^\d{3}[0G]ZZLU"
+

 STOP_TYPES = {
    "AIR": "Airport",
@ -25,25 +35,110 @@ STOP_TYPES = {
    "RLY": "Rail station",
    "RSE": "Rail station",
    "BCT": "Bus stop",
+    # Bus/coach stations: BST is the station access-area node, BCS/BCQ are
+    # bays/stands within the station and BCE is a station entrance. NaPTAN maps
+    # very few BCE nodes (~80 GB-wide), so without BST/BCS/BCQ the category was
+    # so sparse that 20% of England showed the nearest bus station >100km away.
+    # Bays and entrances collapse to one POI per station via
+    # STATION_MERGE_CATEGORIES below.
+    "BST": "Bus station",
+    "BCS": "Bus station",
+    "BCQ": "Bus station",
    "BCE": "Bus station",
    "TXR": "Taxi rank",
-    "TMU": "Tube station",
-    "MET": "Tube station",
+    # Tram/Metro/Underground: TMU is an entrance node, MET the station access
+    # area. Both start as "Tram & Metro stop"; merged stations whose ATCO codes
+    # mark them as London Underground (ZZLU) are reclassified to "Tube station"
+    # after dedup (see _deduplicate_station_areas). Heritage railways (RHDR,
+    # Severn Valley, ...) are TMU/MET in NaPTAN with no machine-readable
+    # "heritage" flag, so they remain in "Tram & Metro stop".
+    "TMU": TRAM_METRO_CATEGORY,
+    "MET": TRAM_METRO_CATEGORY,
 }

 # Stop types that are access/entrance nodes rather than the primary station or
-# terminal node. During dedup the primary node (e.g. RLY/FER) wins so a station
-# with both a station node and entrances yields one POI at the station node.
-ENTRANCE_STOP_TYPES = {"RSE", "FTD"}
+# terminal node. During dedup the primary node (e.g. RLY/FER/MET) wins so a
+# station with both a station node and entrances yields one POI at the station
+# node.
+ENTRANCE_STOP_TYPES = {"RSE", "FTD", "TMU", "BCE"}

 # Categories whose entrances/variants are merged into a single station-level POI
 # by normalized name + area (like Tube stations), so an RLY node and its RSE
 # entrances collapse to one POI at the station node.
-STATION_MERGE_CATEGORIES = {TUBE_STATION_CATEGORY, "Rail station", "Ferry"}
+STATION_MERGE_CATEGORIES = {
+    TRAM_METRO_CATEGORY,
+    TUBE_STATION_CATEGORY,
+    "Rail station",
+    "Ferry",
+    "Bus station",
+}


 OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]

+# Trailing entrance designators ("North East Ent", "Main Entrance No 2",
+# "West Station Entrance", ...) are stripped from canonical names so a
+# station's individually-named entrance nodes collapse into the station.
+# A trailing run of filler words is only stripped when it contains at least
+# one entrance word, so "Maze Hill North" or "Platform 1" are untouched.
+_ENTRANCE_NAME_WORDS = {"ent", "entrance", "entrances", "access"}
+_ENTRANCE_FILLER_WORDS = {
+    "north",
+    "south",
+    "east",
+    "west",
+    "ne",
+    "nw",
+    "se",
+    "sw",
+    "n",
+    "s",
+    "e",
+    "w",
+    "wt",
+    "main",
+    "side",
+    "no",
+    "station",
+    "stop",
+    "platform",
+}
+
+_ENTRANCE_WORDS_RE = "(?:ent|entrance|entrances|access)"
+_ENTRANCE_FILLER_RE = (
+    r"(?:north|south|east|west|ne|nw|se|sw|n|s|e|w|wt|main|side|no|station|stop"
+    r"|platform|\d+)"
+)
+_ENTRANCE_SUFFIX_RE = (
+    rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*"
+    rf"\s+{_ENTRANCE_WORDS_RE}"
+    rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*$"
+)
+
+# Bus-station bay/stand designators ("Stand A3", "Bay 2", "Stance 5") are
+# stripped so every bay of one station shares a canonical name. The designator
+# word must be followed by a short alphanumeric token, so place names ending in
+# a bare "Bay" (Colwyn Bay, Herne Bay) are untouched.
+_BAY_WORDS = {"stand", "stance", "bay", "gate"}
+_BAY_SUFFIX_RE = r"\s+(?:stand|stance|bay|gate)\s+[a-z0-9]{1,3}$"
+
+
+def _strip_entrance_suffix(words: list[str]) -> list[str]:
+    """Drop a trailing entrance designator from a tokenized stop name.
+
+    Scans from the end over entrance words, filler words and numbers; the run
+    is removed only if it contains at least one entrance word. The first
+    token is never removed, matching ``_ENTRANCE_SUFFIX_RE`` (which requires
+    whitespace before every token it strips), so a name made up entirely of
+    designator words ("Main Entrance") keeps a non-empty canonical form and
+    both implementations agree.
+
+    Args:
+        words: Lower-cased name tokens.
+
+    Returns:
+        ``words`` without the designator, or ``words`` unchanged when no
+        entrance word is found in the trailing run.
+    """
+    idx = len(words)
+    saw_entrance = False
+    # idx > 1 (not > 0): always keep the leading token.
+    while idx > 1:
+        word = words[idx - 1]
+        if word in _ENTRANCE_NAME_WORDS:
+            saw_entrance = True
+        elif word.isdigit() or word in _ENTRANCE_FILLER_WORDS:
+            pass
+        else:
+            break
+        idx -= 1
+    return words[:idx] if saw_entrance else words
+

 def canonical_station_name(name: str | None) -> str:
    """Normalize station names so entrances/transport-mode variants collapse."""
@ -55,18 +150,24 @@ def canonical_station_name(name: str | None) -> str:
    normalized = re.sub(r"['’`]", "", normalized)
    normalized = normalized.replace("&", " and ")
    normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
-    words = normalized.split()
+    words = _strip_entrance_suffix(normalized.split())
+
+    if len(words) >= 3 and words[-2] in _BAY_WORDS and len(words[-1]) <= 3:
+        del words[-2:]

    suffixes = (
        ("underground", "station"),
        ("tube", "station"),
        ("dlr", "station"),
        ("metro", "station"),
+        ("metrolink", "station"),
+        ("metrolink", "stop"),
        ("tram", "stop"),
        ("rail", "station"),
        ("railway", "station"),
        ("station",),
        ("stop",),
+        ("metrolink",),
    )
    while True:
        suffix = next(
@ -88,11 +189,14 @@ def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
    expr = expr.str.replace_all(r"&", " and ")
    expr = expr.str.replace_all(r"[^a-z0-9]+", " ")
    expr = expr.str.replace_all(r"\s+", " ").str.strip_chars()
+    expr = expr.str.replace_all(_ENTRANCE_SUFFIX_RE, "")
+    expr = expr.str.replace_all(_BAY_SUFFIX_RE, "")
    expr = expr.str.replace_all(
-        r"\s+(underground|tube|dlr|metro|rail|railway)\s+station$", ""
+        r"\s+(underground|tube|dlr|metro|metrolink|rail|railway)\s+station$", ""
    )
-    expr = expr.str.replace_all(r"\s+tram\s+stop$", "")
+    expr = expr.str.replace_all(r"\s+(metrolink|tram)\s+stop$", "")
    expr = expr.str.replace_all(r"\s+(station|stop)$", "")
+    expr = expr.str.replace_all(r"\s+metrolink$", "")
    return expr.str.strip_chars()


@ -140,6 +244,7 @@ class StationAccumulator:
    lat_sum: float
    lng_sum: float
    entrance: bool = False
+    is_lu: bool = False
    count: int = 1

    @property
@ -159,6 +264,7 @@ class StationAccumulator:
        self.lat_sum += float(row["lat"])
        self.lng_sum += float(row["lng"])
        self.count += 1
+        self.is_lu = self.is_lu or bool(row.get("is_lu"))

        name = str(row["name"] or "")
        entrance = bool(row.get("entrance"))
@ -169,6 +275,16 @@ class StationAccumulator:
            self.name = name
            self.entrance = entrance

+    @property
+    def output_category(self) -> str:
+        """Category to emit for this merged station."""
+        # The London Underground flag is tracked for the whole merged group
+        # (not just the winning node) because LU entrance nodes often carry
+        # non-ZZLU codes (e.g. 4900VICT...); a tram/metro group with any LU
+        # node is a genuine Tube station.
+        is_underground = self.category == TRAM_METRO_CATEGORY and self.is_lu
+        return TUBE_STATION_CATEGORY if is_underground else self.category
+

 def _station_from_row(row: dict[str, object]) -> StationAccumulator:
    return StationAccumulator(
@ -178,6 +294,7 @@ def _station_from_row(row: dict[str, object]) -> StationAccumulator:
        lat_sum=float(row["lat"]),
        lng_sum=float(row["lng"]),
        entrance=bool(row.get("entrance")),
+        is_lu=bool(row.get("is_lu")),
    )


@ -217,7 +334,7 @@ def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
        {
            "id": [station.id for station in selected],
            "name": [station.name for station in selected],
-            "category": [station.category for station in selected],
+            "category": [station.output_category for station in selected],
            "lat": [station.lat for station in selected],
            "lng": [station.lng for station in selected],
        }
@ -258,10 +375,12 @@ def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
 def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
    """Deduplicate NaPTAN stops, merging station/terminal entrances by area.

-    Tube, rail and ferry POIs are merged to one record per station by
-    normalized name + area, with the primary station/terminal node (e.g. RLY,
-    FER) winning over an entrance node (RSE, FTD). Other stops are deduplicated
-    by exact name+category+locality.
+    Tram/metro, rail, ferry and bus-station POIs are merged to one record per
+    station by normalized name + area, with the primary station/terminal node
+    (e.g. RLY, FER, MET, BST) winning over an entrance node (RSE, FTD, TMU,
+    BCE). Merged tram/metro stations with a London Underground ATCO code in
+    the group become "Tube station". Other stops are deduplicated by exact
+    name+category+locality.
    """
    station = df.filter(pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
    other = df.filter(~pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
@ -274,6 +393,29 @@ def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
    ).select(OUTPUT_COLUMNS)


+def filter_active_stops(df: pl.DataFrame) -> pl.DataFrame:
+    """Return only the stops whose Status is active (or unknown).
+
+    A stop is kept when its Status is null or, after trimming and
+    lower-casing, equals "active" or "act". When the frame has no Status
+    column at all, a warning is printed and the frame is returned unchanged
+    so extracts without that column still load.
+    """
+    if "Status" not in df.columns:
+        print("WARNING: NaPTAN data has no Status column; keeping all stops")
+        return df
+
+    status = pl.col("Status")
+    is_active = status.str.strip_chars().str.to_lowercase().is_in(["active", "act"])
+    kept = df.filter(status.is_null() | is_active)
+
+    dropped = len(df) - len(kept)
+    if dropped:
+        print(f"Dropped {dropped:,} non-active stops (Status != active)")
+    return kept
+
+
 def download_naptan(output: Path) -> None:
    output.parent.mkdir(parents=True, exist_ok=True)

@ -291,7 +433,8 @@ def download_naptan(output: Path) -> None:
        )
        .drop_nulls(subset=["Latitude", "Longitude"])
        .filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
-        .select(
+    )
+    df = filter_active_stops(df).select(
        pl.col("ATCOCode").alias("id"),
        pl.col("CommonName").alias("name"),
        pl.col("StopType").replace(STOP_TYPES).alias("category"),
@ -299,7 +442,10 @@ def download_naptan(output: Path) -> None:
        pl.col("Longitude").alias("lng"),
        pl.col("NptgLocalityCode").alias("locality"),
        pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
-        )
+        pl.col("ATCOCode")
+        .str.contains(LONDON_UNDERGROUND_ATCO_PATTERN)
+        .fill_null(False)
+        .alias("is_lu"),
    )

    before = len(df)
--- a/pipeline/download/os_greenspace.py
+++ b/pipeline/download/os_greenspace.py
@ -2,12 +2,15 @@

 Downloads the OS Open Greenspace dataset as ESRI Shapefile and extracts
 access point locations (park entrances). Each access point is tagged with
-its parent site's function type (e.g. Public Park Or Garden). Sites without
-access points fall back to polygon centroids.
+its parent site's function type (e.g. Public Park Or Garden), the parent
+site id and the site's polygon centroid. Sites without access points fall
+back to polygon centroids.

 Using access points rather than polygon centroids gives much more accurate
 distance calculations — a property next to Hyde Park won't show 400m just
-because the centroid is in the middle of the park.
+because the centroid is in the middle of the park. The site id / centroid
+columns let downstream consumers (poi_proximity) collapse the frame back to
+one row per SITE for counting, so a park with 30 gates counts as one park.

 Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
 License: Open Government Licence v3.0
@ -65,8 +68,8 @@ def _read_site_functions(shp_path: Path) -> dict[str, str]:

 def _read_access_points(
    shp_path: Path, site_funcs: dict[str, str]
-) -> tuple[list[float], list[float], list[str]]:
-    """Read access points, tagging each with its parent site's function."""
+) -> tuple[list[float], list[float], list[str], list[str]]:
+    """Read access points, tagging each with its parent site's function and id."""
    reader = shp.Reader(str(shp_path), encoding="latin-1")
    field_names = [f[0] for f in reader.fields[1:]]

@ -80,6 +83,7 @@ def _read_access_points(
    lats: list[float] = []
    lngs: list[float] = []
    categories: list[str] = []
+    site_ids: list[str] = []
    skipped = 0
    error_skipped = 0

@ -107,6 +111,7 @@ def _read_access_points(
        lats.append(lat)
        lngs.append(lng)
        categories.append(func)
+        site_ids.append(str(site_id))

    if skipped:
        print(f"  Skipped {skipped:,} access points with unknown site ID")
@ -116,31 +121,26 @@ def _read_access_points(
            error_skipped,
        )

-    return lats, lngs, categories
+    return lats, lngs, categories, site_ids


-def _read_site_centroids(
-    shp_path: Path, site_funcs: dict[str, str], covered_ids: set[str]
-) -> tuple[list[float], list[float], list[str]]:
-    """Read polygon centroids for sites that have no access points (fallback)."""
+def _read_site_centroids(shp_path: Path) -> dict[str, tuple[float, float]]:
+    """Compute the WGS84 polygon centroid of every greenspace site.
+
+    Used both as the representative point for site-level counting and as the
+    location fallback for sites that have no access points.
+    """
    reader = shp.Reader(str(shp_path), encoding="latin-1")
    field_names = [f[0] for f in reader.fields[1:]]
    id_idx = _find_field(field_names, "id")
-    func_idx = _find_field(field_names, "funct")
-    if id_idx is None or func_idx is None:
-        return [], [], []
+    if id_idx is None:
+        return {}

-    lats: list[float] = []
-    lngs: list[float] = []
-    categories: list[str] = []
+    centroids: dict[str, tuple[float, float]] = {}
    error_skipped = 0

    for sr in reader.shapeRecords():
        site_id = sr.record[id_idx]
-        if site_id in covered_ids:
-            continue
-
-        func = sr.record[func_idx]
        try:
            geom = to_shapely(sr.shape.__geo_interface__)
            if geom.is_empty or not geom.is_valid:
@ -156,9 +156,7 @@ def _read_site_centroids(
            )
            continue

-        lats.append(lat)
-        lngs.append(lng)
-        categories.append(func)
+        centroids[str(site_id)] = (lat, lng)

    if error_skipped:
        logger.warning(
@ -166,7 +164,7 @@ def _read_site_centroids(
            error_skipped,
        )

-    return lats, lngs, categories
+    return centroids


 def download_greenspace(output: Path) -> None:
@ -194,33 +192,53 @@ def download_greenspace(output: Path) -> None:

        # Step 2: Read access points (primary — park entrances)
        print(f"Reading {access_shps[0].name}...")
-        ap_lats, ap_lngs, ap_cats = _read_access_points(access_shps[0], site_funcs)
+        ap_lats, ap_lngs, ap_cats, ap_site_ids = _read_access_points(
+            access_shps[0], site_funcs
+        )
        print(f"  {len(ap_lats):,} access points loaded")

-        # Step 3: Fall back to centroids for sites without any access points
-        covered_ids = set()
-        reader = shp.Reader(str(access_shps[0]), encoding="latin-1")
-        field_names = [f[0] for f in reader.fields[1:]]
-        ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
-        if ref_idx is not None:
-            for rec in reader.iterRecords():
-                covered_ids.add(rec[ref_idx])
+        # Step 3: Compute every site's centroid: the representative point for
+        # site-level counting, and the location fallback for sites without any
+        # access points.
+        print("Computing site centroids...")
+        centroids = _read_site_centroids(site_shps[0])
+        print(f"  {len(centroids):,} site centroids computed")

-        print("Adding centroids for sites without access points...")
-        fb_lats, fb_lngs, fb_cats = _read_site_centroids(
-            site_shps[0], site_funcs, covered_ids
-        )
+        covered_ids = set(ap_site_ids)
+        fb_lats: list[float] = []
+        fb_lngs: list[float] = []
+        fb_cats: list[str] = []
+        fb_site_ids: list[str] = []
+        for site_id, (lat, lng) in centroids.items():
+            if site_id in covered_ids:
+                continue
+            func = site_funcs.get(site_id)
+            if func is None:
+                continue
+            fb_lats.append(lat)
+            fb_lngs.append(lng)
+            fb_cats.append(func)
+            fb_site_ids.append(site_id)
        print(f"  {len(fb_lats):,} centroid fallbacks added")

    lats = ap_lats + fb_lats
    lngs = ap_lngs + fb_lngs
    categories = ap_cats + fb_cats
+    site_ids = ap_site_ids + fb_site_ids
+    site_lats = [centroids.get(site_id, (None, None))[0] for site_id in site_ids]
+    site_lngs = [centroids.get(site_id, (None, None))[1] for site_id in site_ids]

    df = pl.DataFrame(
        {
            "lat": np.array(lats, dtype=np.float64),
            "lng": np.array(lngs, dtype=np.float64),
            "category": categories,
+            "site_id": site_ids,
+            # Site polygon centroid (null when the centroid could not be
+            # computed): the representative point when collapsing to one row
+            # per site for counting.
+            "site_lat": pl.Series(site_lats, dtype=pl.Float64),
+            "site_lng": pl.Series(site_lngs, dtype=pl.Float64),
        }
    )

--- a/pipeline/download/places.py
+++ b/pipeline/download/places.py
@ -641,7 +641,7 @@ def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
        match = _DLR_CODE_RE.search(atco_id)
        if not match:
            continue
-        if row["category"] not in {"Tube station", "Rail station"}:
+        if row["category"] not in {"Tube station", "Tram & Metro stop", "Rail station"}:
            continue

        code = match.group(1)
--- a/pipeline/download/test_naptan.py
+++ b/pipeline/download/test_naptan.py
@ -2,9 +2,12 @@ import polars as pl
 import pytest

 from pipeline.download.naptan import (
+    TRAM_METRO_CATEGORY,
+    TUBE_STATION_CATEGORY,
    canonical_station_name,
    canonical_station_name_expr,
    deduplicate_naptan,
+    filter_active_stops,
 )


@ -34,6 +37,127 @@ def test_canonical_station_name_expr_normalizes_transport_suffixes():
    assert [canonical_station_name(name) for name in names] == result


+def test_canonical_station_name_strips_entrance_suffixes():
+    # Entrance names taken from shipped NaPTAN data; each used to miss its
+    # station node when merging (79 stray entrance POIs).
+    expected_keys = {
+        "Weaste Metrolink Station North East Entrance": "weaste",
+        "Weaste Metrolink Station North Entrance No 2": "weaste",
+        "Whitefield Metrolink Station Main Entrance": "whitefield",
+        "Radcliffe Metrolink Station Entrance": "radcliffe",
+        "Stretford Metrolink Station Wt Platform Entrance": "stretford",
+        "Salford Quays Metrolink Station SW entrance": "salford quays",
+        "Bank Station Ent 2": "bank",
+        "Hainault": "hainault",
+        # Metrolink MET node names reduce to the same key.
+        "Weaste (Manchester Metrolink)": "weaste",
+        # Without an entrance word, direction/filler words must be kept.
+        "Maze Hill North": "maze hill north",
+        "Bus Station Entrance": "bus",
+        # Bus-station bay/stand designators reduce to the station name...
+        "Tonypandy Bus Station Stand A3": "tonypandy bus",
+        "Caerphilly Interchange Stand 5": "caerphilly interchange",
+        "Stanley Bus Station Stand G": "stanley bus",
+        # ...whereas a bare trailing "Bay" (a place name) is left alone.
+        "Colwyn Bay": "colwyn bay",
+    }
+    for raw_name, key in expected_keys.items():
+        assert canonical_station_name(raw_name) == key, raw_name
+
+    names = pl.DataFrame({"name": list(expected_keys)})
+    keys = names.select(canonical_station_name_expr().alias("key"))["key"].to_list()
+    assert keys == list(expected_keys.values())
+
+
+def test_filter_active_stops_drops_non_active():
+    stops = pl.DataFrame(
+        {
+            "ATCOCode": ["a", "b", "c", "d"],
+            "Status": ["active", "inactive", None, "Pending"],
+        }
+    )
+
+    kept_codes = filter_active_stops(stops)["ATCOCode"].to_list()
+
+    # Only "active" and null statuses are kept; "inactive"/"Pending" rows go.
+    assert kept_codes == ["a", "c"]
+
+
+def test_filter_active_stops_tolerates_missing_status_column():
+    # No Status column at all: the single row must come back unchanged.
+    unfiltered = filter_active_stops(pl.DataFrame({"ATCOCode": ["a"]}))
+    assert unfiltered["ATCOCode"].to_list() == ["a"]
+
+
+def test_deduplicate_naptan_splits_london_underground_from_tram_metro():
+    # MET station nodes and TMU entrances, all pre-categorised as the
+    # tram/metro family. Hainault's group includes a 940GZZLU station node, so
+    # it must merge into a real "Tube station" despite the entrance's non-ZZLU
+    # ATCO code, while the Metrolink group remains a "Tram & Metro stop".
+    stops = pl.DataFrame(
+        {
+            "id": [
+                "940GZZLUHLT",
+                "490000095003",
+                "9400ZZMAWST",
+                "1800NFR2691",
+            ],
+            "name": [
+                "Hainault Underground Station",
+                "Hainault",
+                "Weaste (Manchester Metrolink)",
+                "Weaste Metrolink Station North West Entrance",
+            ],
+            "category": [TRAM_METRO_CATEGORY] * 4,
+            "lat": [51.6034, 51.6037, 53.4826, 53.4826],
+            "lng": [0.0933, 0.0931, -2.3087, -2.3086],
+            "locality": [None] * 4,
+            "entrance": [False, True, False, True],
+            "is_lu": [True, False, False, False],
+        }
+    )
+
+    merged = deduplicate_naptan(stops).sort("category")
+
+    assert len(merged) == 2
+    assert merged["category"].to_list() == [
+        TRAM_METRO_CATEGORY,
+        TUBE_STATION_CATEGORY,
+    ]
+    # Each merged POI must be represented by its station node, not an entrance.
+    tube_rows = merged.filter(pl.col("category") == TUBE_STATION_CATEGORY)
+    assert tube_rows["id"][0] == "940GZZLUHLT"
+    tram_rows = merged.filter(pl.col("category") == TRAM_METRO_CATEGORY)
+    assert tram_rows["id"][0] == "9400ZZMAWST"
+
+
+def test_deduplicate_naptan_merges_bus_station_bays_and_entrances():
+    # Two BCS bays plus one BCE entrance of the same bus station must fold into
+    # one POI carried by a non-entrance node, while a bus station in a
+    # different area stays a separate POI.
+    stops = pl.DataFrame(
+        {
+            "id": ["bay-1", "bay-2", "ent-1", "other"],
+            "name": [
+                "Bury Interchange",
+                "Bury Interchange",
+                "Bury Interchange East Entrance",
+                "Rochdale Interchange",
+            ],
+            "category": ["Bus station"] * 4,
+            "lat": [53.5907, 53.5908, 53.5909, 53.6160],
+            "lng": [-2.2958, -2.2957, -2.2956, -2.1561],
+            "locality": ["BURY", "BURY", "BURY", "ROCHDALE"],
+            "entrance": [False, False, True, False],
+        }
+    )
+
+    merged = deduplicate_naptan(stops).sort("name")
+
+    assert merged["name"].to_list() == ["Bury Interchange", "Rochdale Interchange"]
+    assert merged.filter(pl.col("name") == "Bury Interchange")["id"][0] == "bay-1"
+
+
 def test_deduplicate_naptan_merges_tube_station_variants_by_area():
    df = pl.DataFrame(
        {
--- a/pipeline/download/test_places.py
+++ b/pipeline/download/test_places.py
@ -86,7 +86,7 @@ def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
                "Bank",
            ],
            "category": [
-                "Tube station",
+                "Tram & Metro stop",
                "Tube station",
                "Rail station",
                "Bus stop",
--- a/pipeline/download/test_transit_network.py
+++ b/pipeline/download/test_transit_network.py
@ -1,11 +1,15 @@
 """Tests for transit_network GTFS processing."""

+import datetime as dt
 import zipfile
 from pathlib import Path

 import pytest

-from pipeline.download.transit_network import convert_high_freq_to_frequency_based
+from pipeline.download.transit_network import (
+    convert_high_freq_to_frequency_based,
+    validate_gtfs_feed,
+)


 def _write_gtfs(path: Path, *, stop_times: str) -> None:
@ -77,3 +81,162 @@ def test_raises_when_no_first_stops_found(tmp_path: Path) -> None:

    with pytest.raises(RuntimeError, match="no first stops"):
        convert_high_freq_to_frequency_based(src, dst)
+
+
+# ── validate_gtfs_feed ────────────────────────────────────────────────────────
+
+TODAY = dt.date(2026, 6, 10)
+
+
+def _make_gtfs(
+    path: Path,
+    *,
+    calendar: str | None = (
+        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
+        "start_date,end_date\n"
+        "S1,1,1,1,1,1,0,0,20260101,20271231\n"
+    ),
+    calendar_dates: str | None = None,
+    stops: str = (
+        "stop_id,stop_name,stop_lat,stop_lon\n"
+        "STOP_A,Bank,51.5133,-0.0886\n"
+        "STOP_B,Liverpool Street,51.5178,-0.0823\n"
+    ),
+    routes: str = "route_id,agency_id,route_short_name,route_type\nR1,OP1,Central,1\n",
+    trips: str = "trip_id,route_id,service_id\nT1,R1,S1\n",
+    stop_times: str = (
+        "trip_id,stop_sequence,departure_time,stop_id\n"
+        "T1,0,06:00:00,STOP_A\n"
+        "T1,1,06:02:00,STOP_B\n"
+    ),
+) -> Path:
+    """Build a minimal GTFS zip at ``path``; the defaults describe a live feed."""
+    optional = {"calendar.txt": calendar, "calendar_dates.txt": calendar_dates}
+    core = {"stops.txt": stops, "routes.txt": routes, "trips.txt": trips}
+    core["stop_times.txt"] = stop_times
+    with zipfile.ZipFile(path, "w") as archive:
+        for member, text in optional.items():
+            if text is not None:  # None means "leave this member out"
+                archive.writestr(member, text)
+        for member, text in core.items():
+            archive.writestr(member, text)
+    return path
+
+
+def test_validate_gtfs_feed_happy_path(tmp_path: Path) -> None:
+    gtfs_zip = _make_gtfs(tmp_path / "feed.zip")
+    validate_gtfs_feed(gtfs_zip, "test feed", today=TODAY)  # defaults are valid
+
+
+def test_validate_gtfs_feed_expired_calendar(tmp_path: Path) -> None:
+    """Mirrors the 2010 TfL snapshot: every calendar ended years before TODAY."""
+    gtfs_zip = _make_gtfs(
+        tmp_path / "feed.zip",
+        calendar=(
+            "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
+            "start_date,end_date\n"
+            "S1,1,1,1,1,1,0,0,20091201,20101224\n"
+        ),
+    )
+    with pytest.raises(RuntimeError, match=r"'stale tfl'.*no service active"):
+        validate_gtfs_feed(gtfs_zip, "stale tfl", today=TODAY)
+
+
+def test_validate_gtfs_feed_calendar_starting_after_window_fails(
+    tmp_path: Path,
+) -> None:
+    gtfs_zip = _make_gtfs(
+        tmp_path / "feed.zip",
+        calendar=(  # service only begins in 2027, beyond the lookahead window
+            "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
+            "start_date,end_date\n"
+            "S1,1,1,1,1,1,0,0,20270101,20271231\n"
+        ),
+    )
+    with pytest.raises(RuntimeError, match="no service active"):
+        validate_gtfs_feed(gtfs_zip, "future feed", today=TODAY)
+
+
+def test_validate_gtfs_feed_calendar_dates_rescues_expired_calendar(
+    tmp_path: Path,
+) -> None:
+    """Added service in calendar_dates.txt outweighs an expired calendar.txt."""
+    gtfs_zip = _make_gtfs(
+        tmp_path / "feed.zip",
+        calendar=(
+            "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
+            "start_date,end_date\n"
+            "S1,1,1,1,1,1,0,0,20091201,20101224\n"
+        ),
+        calendar_dates="service_id,date,exception_type\nS1,20260615,1\n",
+    )
+    validate_gtfs_feed(gtfs_zip, "rescued feed", today=TODAY)  # no raise expected
+
+
+def test_validate_gtfs_feed_removed_service_exception_does_not_count(
+    tmp_path: Path,
+) -> None:
+    gtfs_zip = _make_gtfs(
+        tmp_path / "feed.zip",
+        calendar=None,  # calendar_dates.txt is the only calendar member
+        calendar_dates="service_id,date,exception_type\nS1,20260615,2\n",
+    )
+    with pytest.raises(RuntimeError, match="no service active"):
+        validate_gtfs_feed(gtfs_zip, "removed-only feed", today=TODAY)
+
+
+def test_validate_gtfs_feed_zero_and_empty_coords(tmp_path: Path) -> None:
+    """Second 2010 TfL snapshot failure mode: stop coords blank or 0,0."""
+    gtfs_zip = _make_gtfs(
+        tmp_path / "feed.zip",
+        stops=(
+            "stop_id,stop_name,stop_lat,stop_lon\n"
+            "STOP_A,Nowhere,0,0\n"
+            "STOP_B,Blank,,\n"
+        ),
+    )
+    with pytest.raises(RuntimeError, match=r"plausible UK coordinates"):
+        validate_gtfs_feed(gtfs_zip, "coordless feed", today=TODAY)
+
+
+def test_validate_gtfs_feed_non_uk_coords_fail(tmp_path: Path) -> None:
+    gtfs_zip = _make_gtfs(
+        tmp_path / "feed.zip",
+        stops=(  # well-formed coordinates, but neither stop is in the UK box
+            "stop_id,stop_name,stop_lat,stop_lon\n"
+            "STOP_A,New York,40.71,-74.0\n"
+            "STOP_B,Sydney,-33.87,151.21\n"
+        ),
+    )
+    with pytest.raises(RuntimeError, match="plausible UK coordinates"):
+        validate_gtfs_feed(gtfs_zip, "abroad feed", today=TODAY)
+
+
+def test_validate_gtfs_feed_minority_bad_coords_pass(tmp_path: Path) -> None:
+    """A single 0,0 stop among 30 (3.3% bad) is within the 5% allowance."""
+    good = [f"STOP_{i},Stop {i},51.5,{-0.1 + i * 0.001}\n" for i in range(29)]
+    body = "".join(good) + "STOP_BAD,Broken,0,0\n"
+    gtfs_zip = _make_gtfs(
+        tmp_path / "feed.zip",
+        stops="stop_id,stop_name,stop_lat,stop_lon\n" + body,
+    )
+    validate_gtfs_feed(gtfs_zip, "mostly good feed", today=TODAY)  # no raise expected
+
+
+def test_validate_gtfs_feed_empty_trips(tmp_path: Path) -> None:
+    header_only = _make_gtfs(tmp_path / "feed.zip", trips="trip_id,route_id,service_id\n")
+    with pytest.raises(RuntimeError, match="trips.txt has no data rows"):
+        validate_gtfs_feed(header_only, "tripless feed", today=TODAY)
+
+
+def test_validate_gtfs_feed_missing_calendar_files(tmp_path: Path) -> None:
+    no_calendars = _make_gtfs(tmp_path / "feed.zip", calendar=None)
+    with pytest.raises(RuntimeError, match="neither calendar.txt nor calendar_dates"):
+        validate_gtfs_feed(no_calendars, "calendarless feed", today=TODAY)
+
+
+def test_validate_gtfs_feed_not_a_zip(tmp_path: Path) -> None:
+    fake_zip = tmp_path / "feed.zip"
+    fake_zip.write_text("not a zip")  # plain text under a .zip name
+    with pytest.raises(RuntimeError, match="not a valid zip"):
+        validate_gtfs_feed(fake_zip, "bogus feed", today=TODAY)
--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@ -2,24 +2,32 @@

 Downloads:
  - England OSM PBF from Geofabrik (~1.5GB)
-  - BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
-  - TfL TransXChange timetables → converted to GTFS
-  - National Rail CIF timetable → converted to GTFS (requires credentials)
+  - BODS GTFS from Bus Open Data Service (~1.5GB; all England bus/tram/ferry,
+    plus London Underground, DLR, London Tramlink and the IFS Cloud Cable Car)
+  - National Rail CIF timetable → converted to GTFS (requires credentials;
+    includes the Elizabeth line, TOC "XR")

 Then processes for R5 compatibility:
  - Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
  - Converts high-frequency metro/tram services to frequency-based GTFS
-  - Converts TfL TransXChange to GTFS via transxchange2gtfs
  - Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
+  - Validates every produced GTFS zip (active calendar window, plausible UK
+    stop coordinates, non-empty routes/trips/stop_times)

-Requires: osmium-tool, Node.js (npx), Docker (for national rail)
+Note: the legacy TfL TransXChange feed (tfl.gov.uk journey-planner-timetables)
+was removed: that URL serves a 2010-10-28 snapshot whose calendars all expired
+in 2010 and whose stops have empty/0,0 coordinates, so it contributed zero
+service. BODS covers all TfL modes that feed nominally provided.
+
+Requires: osmium-tool, Docker (for national rail)

 Output directory: property-data/transit/
-  raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip
+  raw/england.osm.pbf + bods_gtfs.zip + national_rail_gtfs.zip
 """

 import argparse
 import csv
+import datetime as dt
 import io
 import json
 import os
@ -45,20 +53,18 @@ ENGLAND_PBF_URL = (
 # Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
 BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"

-# TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car)
-TFL_TRANSXCHANGE_URL = (
-    "https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip"
-)
-
-# NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken)
-NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
-
 # National Rail Open Data API
 NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
 NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"

 USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
-TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"
+
+# GTFS validation: a feed must have service within this many days of the build
+# date, and at least this fraction of stops must have plausible UK coordinates.
+GTFS_CALENDAR_LOOKAHEAD_DAYS = 60
+GTFS_MIN_VALID_STOP_FRACTION = 0.95
+UK_LAT_RANGE = (49.0, 61.0)
+UK_LON_RANGE = (-9.0, 2.5)


 def _download_http(
@ -468,89 +474,175 @@ def convert_high_freq_to_frequency_based(
    print(f"  Saved to {dst}")


-def download_tfl_transxchange(raw_dir: Path) -> Path:
-    """Download TfL TransXChange timetable bundle."""
-    dest = raw_dir / "tfl_transxchange.zip"
-    if dest.exists():
-        print(f"TfL TransXChange already exists: {dest}")
-        return dest
-
-    print("Downloading TfL TransXChange timetables...")
-    _download_http(TFL_TRANSXCHANGE_URL, dest, desc="tfl_transxchange.zip")
-    return dest
+def _gtfs_has_data_row(z: zipfile.ZipFile, filename: str) -> bool:
+    """Report whether a GTFS member has any non-empty row below its header.
+
+    Stops at the first row that `_parse_csv_line` turns into a non-empty result.
+    """
+    with z.open(filename) as member:
+        member.readline()  # discard the header row
+        return any(_parse_csv_line(raw) for raw in member)


-def download_naptan() -> None:
-    """Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
-    dest = local_tmp_dir() / "Stops.csv"
-    if dest.exists():
-        print(f"NaPTAN Stops.csv already exists: {dest}")
-        return
+def _calendar_active_in_window(
+    z: zipfile.ZipFile, names: set[str], window_start: int, window_end: int
+) -> bool:
+    """True if calendar.txt/calendar_dates.txt have service in [start, end].

-    print("Downloading NaPTAN stops data...")
-    _download_http(NAPTAN_URL, dest, desc="Stops.csv")
-
-
-def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
-    """Convert TfL TransXChange to GTFS using transxchange2gtfs."""
-    dest = output_dir / "tfl_gtfs.zip"
-    if dest.exists():
-        print(f"TfL GTFS already exists: {dest}")
-        return dest
-
-    txc_path = raw_dir / "tfl_transxchange.zip"
-
-    # Ensure NaPTAN is available (transxchange2gtfs has a broken download URL)
-    download_naptan()
-
-    print("Converting TfL TransXChange → GTFS...")
-    # The shim patches known packaging/runtime issues in the pinned npm package
-    # before loading its CLI from npx's temporary install.
-    shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
-    subprocess.run(
-        [
-            "npx",
-            "--yes",
-            "--package",
-            TRANSXCHANGE2GTFS_PACKAGE,
-            "sh",
-            "-c",
-            "\n".join(
-                [
-                    'bin="$(command -v transxchange2gtfs)"',
-                    'script="$(readlink -f "$bin")"',
-                    'pkg_dir="$(dirname "$(dirname "$script")")"',
-                    'shim="$1"',
-                    "shift",
-                    'exec node "$shim" "$pkg_dir" "$@"',
-                ]
-            ),
-            "transxchange2gtfs",
-            str(shim_path.resolve()),
-            str(txc_path.resolve()),
-            str(dest.resolve()),
-        ],
-        check=True,
+    Dates are compared as YYYYMMDD integers. A calendar.txt row counts when its
+    range overlaps the window AND a weekday flag is set; a calendar_dates.txt
+    row counts only when it adds service (exception_type=1) inside the window.
+    A calendar.txt header without start_date/end_date defers to calendar_dates.
+    """
+    weekdays = (
+        "monday",
+        "tuesday",
+        "wednesday",
+        "thursday",
+        "friday",
+        "saturday",
+        "sunday",
+    )
+    if "calendar.txt" in names:
+        with z.open("calendar.txt") as f:
+            cols = _parse_csv_line(f.readline())
+            try:
+                start_idx = cols.index("start_date")
+                end_idx = cols.index("end_date")
+            except ValueError:
+                start_idx = end_idx = None  # unusable header: defer to calendar_dates
+            day_idxs = [cols.index(d) for d in weekdays if d in cols]
+            for line in (f if start_idx is not None else ()):
+                parts = _parse_csv_line(line)
+                if not parts:
+                    continue
+                try:
+                    start = int(parts[start_idx].strip('"'))
+                    end = int(parts[end_idx].strip('"'))
+                except (ValueError, IndexError):
+                    continue
+                if start > window_end or end < window_start:
+                    continue
+                if day_idxs and not any(
+                    parts[i].strip('"') == "1" for i in day_idxs if i < len(parts)
+                ):
+                    continue
+                return True
+
+    if "calendar_dates.txt" in names:
+        with z.open("calendar_dates.txt") as f:
+            cols = _parse_csv_line(f.readline())
+            try:
+                date_idx = cols.index("date")
+                exc_idx = cols.index("exception_type")
+            except ValueError:
+                return False
+            for line in f:
+                parts = _parse_csv_line(line)
+                if not parts:
+                    continue
+                try:
+                    date = int(parts[date_idx].strip('"'))
+                except (ValueError, IndexError):
+                    continue
+                if exc_idx >= len(parts) or parts[exc_idx].strip('"') != "1":
+                    continue
+                if window_start <= date <= window_end:
+                    return True
+
+    return False
+
+
+def validate_gtfs_feed(path: Path, feed_name: str, *, today: dt.date | None = None) -> None:
+    """Sanity-check a produced/downloaded GTFS zip; raise RuntimeError if dead.
+
+    Guards against shipping a zero-service feed (the old TfL dump: 2010 calendars,
+    empty/0,0 stop coordinates). ``today`` defaults to dt.date.today(). Checks:
+      (a) calendar.txt/calendar_dates.txt have at least one service active
+          within [today, today + GTFS_CALENDAR_LOOKAHEAD_DAYS];
+      (b) stops.txt is non-empty and >= GTFS_MIN_VALID_STOP_FRACTION of stops
+          have plausible UK coordinates (lat 49-61, lon -9..2.5, not 0,0);
+      (c) routes.txt, trips.txt, stop_times.txt and stops.txt each have data rows.
+    """
+    if today is None:
+        today = dt.date.today()
+    window_start = int(today.strftime("%Y%m%d"))  # YYYYMMDD as an int
+    window_end = int(
+        (today + dt.timedelta(days=GTFS_CALENDAR_LOOKAHEAD_DAYS)).strftime("%Y%m%d")
+    )
+
+    def fail(reason: str) -> None:  # always raises RuntimeError
+        raise RuntimeError(
+            f"GTFS validation failed for feed '{feed_name}' ({path}): {reason}"
+        )
+
+    print(f"Validating GTFS feed '{feed_name}'...")
+    if not path.exists() or not zipfile.is_zipfile(path):
+        fail("not a valid zip file")
+
+    with zipfile.ZipFile(path) as z:
+        names = set(z.namelist())
+
+        # (c) core files present and non-empty (stops.txt included; (b) reads it below)
+        for required in ("routes.txt", "trips.txt", "stop_times.txt", "stops.txt"):
+            if required not in names:
+                fail(f"missing {required}")
+            if not _gtfs_has_data_row(z, required):
+                fail(f"{required} has no data rows")
+
+        # (a) at least one service active in the routing window
+        if "calendar.txt" not in names and "calendar_dates.txt" not in names:
+            fail("has neither calendar.txt nor calendar_dates.txt")
+        if not _calendar_active_in_window(z, names, window_start, window_end):
+            fail(
+                f"no service active between {window_start} and {window_end} — "
+                "the feed's calendars are stale/expired and it would contribute "
+                "zero service to routing"
+            )
+
+        # (b) stops have plausible UK coordinates; every non-empty row is counted
+        total_stops = 0
+        valid_stops = 0
+        with z.open("stops.txt") as f:
+            cols = _parse_csv_line(f.readline())
+            try:
+                lat_idx = cols.index("stop_lat")
+                lon_idx = cols.index("stop_lon")
+            except ValueError:
+                fail("stops.txt is missing stop_lat/stop_lon columns")
+            for line in f:
+                parts = _parse_csv_line(line)
+                if not parts:
+                    continue
+                total_stops += 1
+                try:
+                    lat = float(parts[lat_idx].strip('"'))
+                    lon = float(parts[lon_idx].strip('"'))
+                except (ValueError, IndexError):
+                    continue  # empty/garbage coordinate → invalid
+                if lat == 0.0 and lon == 0.0:
+                    continue
+                if (
+                    UK_LAT_RANGE[0] <= lat <= UK_LAT_RANGE[1]
+                    and UK_LON_RANGE[0] <= lon <= UK_LON_RANGE[1]
+                ):
+                    valid_stops += 1
+        if total_stops == 0:
+            fail("stops.txt has no stops")
+        fraction = valid_stops / total_stops
+        if fraction < GTFS_MIN_VALID_STOP_FRACTION:
+            fail(
+                f"only {valid_stops}/{total_stops} stops "
+                f"({fraction:.1%}) have plausible UK coordinates "
+                f"(lat {UK_LAT_RANGE[0]}-{UK_LAT_RANGE[1]}, "
+                f"lon {UK_LON_RANGE[0]}..{UK_LON_RANGE[1]}, non-null, not 0,0); "
+                f"need >= {GTFS_MIN_VALID_STOP_FRACTION:.0%}"
+            )
+
+    print(
+        f"  OK: service active in window, {valid_stops}/{total_stops} stops "
+        f"({fraction:.1%}) with plausible UK coordinates"
    )
-    required_files = {
-        "agency.txt",
-        "calendar.txt",
-        "calendar_dates.txt",
-        "routes.txt",
-        "stop_times.txt",
-        "stops.txt",
-        "trips.txt",
-    }
-    if not dest.exists() or not zipfile.is_zipfile(dest):
-        raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
-    with zipfile.ZipFile(dest) as z:
-        missing = required_files - set(z.namelist())
-    if missing:
-        missing_str = ", ".join(sorted(missing))
-        raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
-    size_mb = dest.stat().st_size / (1024 * 1024)
-    print(f"  Saved to {dest} ({size_mb:.1f} MB)")
-    return dest


 def download_national_rail_cif(raw_dir: Path) -> Path | None:
@ -1007,18 +1099,15 @@ def main() -> None:
        required=True,
        help="Output directory for transit data",
    )
-    parser.add_argument(
-        "--skip-tfl",
-        action="store_true",
-        help="Skip TfL TransXChange download and conversion",
-    )
    args = parser.parse_args()

    output_dir: Path = args.output
    raw_dir = output_dir / "raw"
    raw_dir.mkdir(parents=True, exist_ok=True)

-    # 1. Download, clean, and frequency-convert BODS GTFS
+    # 1. Download, clean, and frequency-convert BODS GTFS. BODS covers all
+    # England bus/tram/ferry plus London Underground, DLR, London Tramlink and
+    # the IFS Cloud Cable Car, so no separate TfL feed is needed.
    download_osm_pbf(raw_dir)
    bods_raw = download_bods_gtfs(raw_dir)

@ -1027,16 +1116,10 @@ def main() -> None:

    bods_final = output_dir / "bods_gtfs.zip"
    convert_high_freq_to_frequency_based(bods_cleaned, bods_final)
+    validate_gtfs_feed(bods_final, "BODS GTFS")

-    # 2. TfL TransXChange → GTFS
-    if args.skip_tfl:
-        print("Skipping TfL (--skip-tfl)")
-    else:
-        download_tfl_transxchange(raw_dir)
-        convert_tfl_to_gtfs(raw_dir, output_dir)
-
-    # 3. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
-    # reach the ~2,725 railway-station destinations, so a bus/TfL-only network
+    # 2. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
+    # reach the ~2,725 railway-station destinations, so a bus/metro-only network
    # silently overstates every train commute. Missing credentials are a HARD
    # error, so a rail-less network can never ship.
    cif = download_national_rail_cif(raw_dir)
@ -1048,7 +1131,8 @@ def main() -> None:
            "required; without it the transit network models every train journey "
            "as bus-only and overstates commute times."
        )
-    convert_national_rail_to_gtfs(raw_dir, output_dir)
+    nr_final = convert_national_rail_to_gtfs(raw_dir, output_dir)
+    validate_gtfs_feed(nr_final, "National Rail GTFS")

    # Summary
    print()
--- a/pipeline/download/transxchange2gtfs_shim.js
+++ b/pipeline/download/transxchange2gtfs_shim.js
@ -1,106 +0,0 @@
-#!/usr/bin/env node
-"use strict";
-
-const fs = require("fs");
-const path = require("path");
-const { createRequire } = require("module");
-
-const [pkgDirArg, ...converterArgs] = process.argv.slice(2);
-
-if (!pkgDirArg || converterArgs.length < 2) {
-  console.error(
-    "Usage: transxchange2gtfs_shim.js <package-dir> <input...> <output>",
-  );
-  process.exit(2);
-}
-
-const pkgDir = path.resolve(pkgDirArg);
-const defaultTmpDir = path.resolve(__dirname, "..", "..", ".tmp");
-const localTmpDir =
-  process.env.TMPDIR || process.env.TEMP || process.env.TMP || defaultTmpDir;
-const stopsCsv = path.join(localTmpDir, "Stops.csv");
-const converterTmpPrefix = path.join(localTmpDir, "transxchange2gtfs_");
-const converterTmpPatch =
-  `static TMP = ${JSON.stringify(converterTmpPrefix)}` +
-  ` + process.pid + ${JSON.stringify(path.sep)};`;
-
-fs.mkdirSync(localTmpDir, { recursive: true });
-
-function replaceOnce(relativePath, before, after) {
-  const file = path.join(pkgDir, relativePath);
-  const original = fs.readFileSync(file, "utf8");
-  if (original.includes(before)) {
-    fs.writeFileSync(file, original.replace(before, after));
-  } else if (original.includes(after)) {
-    return;
-  } else {
-    throw new Error(`Could not patch ${relativePath}: expected text not found`);
-  }
-}
-
-// The published 1.12.0 package has a few compatibility issues with current
-// TfL TransXChange exports:
-// - the bin script points at dist/src/cli.js, but the package ships dist/cli.js
-// - the compiled date-holidays import expects a synthetic default export
-// - some TfL journeys reference timing links without matching route-link geometry
-//
-// GTFS shapes are optional for R5 routing. Clear shape references and omit
-// shapes.txt so missing route geometry does not drop otherwise usable trips.
-function patchPackage() {
-  replaceOnce(
-    "dist/Container.js",
-    "static TMP = `/tmp/transxchange2gtfs_${process.pid}/`;",
-    converterTmpPatch,
-  );
-  replaceOnce(
-    "dist/Container.js",
-    'fs.existsSync("/tmp/Stops.csv")',
-    `fs.existsSync(${JSON.stringify(stopsCsv)})`,
-  );
-  replaceOnce(
-    "dist/Container.js",
-    'fs.createReadStream("/tmp/Stops.csv", "utf8")',
-    `fs.createReadStream(${JSON.stringify(stopsCsv)}, "utf8")`,
-  );
-  replaceOnce(
-    "dist/converter/GetStopData.js",
-    'fs.createWriteStream("/tmp/Stops.csv")',
-    `fs.createWriteStream(${JSON.stringify(stopsCsv)})`,
-  );
-  replaceOnce(
-    "dist/transxchange/TransXChangeJourneyStream.js",
-    "distanceSoFarM += routeLink.Distance;",
-    "distanceSoFarM += routeLink ? routeLink.Distance : 0;",
-  );
-  replaceOnce(
-    "dist/gtfs/TripsStream.js",
-    "(0, crypto_1.createHash)('md5').update(JSON.stringify({ routeId: journey.route, routeLinkSeq: journey.routeLinkIds })).digest(\"hex\"));",
-    "\"\");",
-  );
-  replaceOnce(
-    "dist/gtfs/StopTimesStream.js",
-    "stop.shapeDistTraveled, stop.exactTime ? \"1\" : \"0\");",
-    "\"\", stop.exactTime ? \"1\" : \"0\");",
-  );
-  replaceOnce(
-    "dist/Container.js",
-    "\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex)),\n            \"shapes.txt\": journeyStream.pipe(new ShapesStream_1.ShapesStream())",
-    "\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
-  );
-  replaceOnce(
-    "dist/Container.js",
-    "\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n            \"transfers.txt\": transxchange.pipe(new TransfersStream_1.TransfersStream(naptanIndex, locationIndex)),\n            \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
-    "\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n            \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
-  );
-}
-
-patchPackage();
-
-const pkgRequire = createRequire(path.join(pkgDir, "package.json"));
-const Holidays = pkgRequire("date-holidays");
-if (!Holidays.default) {
-  Holidays.default = Holidays;
-}
-
-process.argv = [process.argv[0], "transxchange2gtfs", ...converterArgs];
-require(path.join(pkgDir, "dist", "cli.js"));
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@ -109,6 +109,27 @@ def _clean_number(column: str, dtype: pl.DataType) -> pl.Expr:
    return _clean_string(column).cast(dtype, strict=False)


+def _join_address_parts(*columns: str) -> pl.Expr:
+    """Build a single-spaced display address from the named component columns.
+
+    Absent price-paid SAON/PAON/STREET values are EMPTY STRINGS, not nulls
+    (saon is "" on ~88% of rows), and ``concat_str(..., ignore_nulls=True)``
+    skips only nulls, so an empty part used to leave its separator behind
+    (``' 10 PALACE GREEN'``, or a doubled space for an empty middle part).
+    Each component is passed through ``_clean_string`` so ``''`` is null when
+    joined; whitespace runs are then collapsed and the ends stripped as a
+    defensive second pass. An address with no content yields null (dropped
+    by the downstream ``pp_address.is_not_null()`` filter), not whitespace.
+    """
+    parts = [_clean_string(name) for name in columns]
+    address = (
+        pl.concat_str(parts, separator=" ", ignore_nulls=True)
+        .str.replace_all(r"\s+", " ")
+        .str.strip_chars()
+    )
+    return pl.when(address == "").then(None).otherwise(address)
+
+
 def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
    return (
        raw.select(
@ -436,11 +457,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
        )
        .filter(pl.col("pp_property_type") != "Other")
        .with_columns(
-            pl.concat_str(
-                [pl.col("saon"), pl.col("paon"), pl.col("street")],
-                separator=" ",
-                ignore_nulls=True,
-            ).alias("pp_address"),
+            _join_address_parts("saon", "paon", "street").alias("pp_address"),
        )
        .with_columns(
            normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@ -102,15 +102,11 @@ _AREA_COLUMNS = [
    # is postcode-grain: it belongs in the area output (one value per postcode,
    # covering property-less postcodes too) rather than duplicated per property.
    TREE_DENSITY_FEATURE,
-    # Schools
-    "Good+ primary schools within 5km",
-    "Good+ secondary schools within 5km",
-    "Good+ primary schools within 2km",
-    "Good+ secondary schools within 2km",
-    "Outstanding primary schools within 5km",
-    "Outstanding secondary schools within 5km",
-    "Outstanding primary schools within 2km",
-    "Outstanding secondary schools within 2km",
+    # Schools (modelled historical catchment areas covering the postcode)
+    "Good+ primary school catchments",
+    "Good+ secondary school catchments",
+    "Outstanding primary school catchments",
+    "Outstanding secondary school catchments",
    # Demographics
    "Median age",
    # Politics
@ -172,14 +168,10 @@ _FINAL_RENAME_COLUMNS = {
    "latest_price": "Last known price",
    "number_habitable_rooms": "Number of bedrooms & living rooms",
    "noise_lden_db": "Noise (dB)",
-    "good_primary_5km": "Good+ primary schools within 5km",
-    "good_secondary_5km": "Good+ secondary schools within 5km",
-    "good_primary_2km": "Good+ primary schools within 2km",
-    "good_secondary_2km": "Good+ secondary schools within 2km",
-    "outstanding_primary_5km": "Outstanding primary schools within 5km",
-    "outstanding_secondary_5km": "Outstanding secondary schools within 5km",
-    "outstanding_primary_2km": "Outstanding primary schools within 2km",
-    "outstanding_secondary_2km": "Outstanding secondary schools within 2km",
+    "good_primary_catchments": "Good+ primary school catchments",
+    "good_secondary_catchments": "Good+ secondary school catchments",
+    "outstanding_primary_catchments": "Outstanding primary school catchments",
+    "outstanding_secondary_catchments": "Outstanding secondary school catchments",
    "max_download_speed": "Max available download speed (Mbps)",
    "serious_crime_avg_yr": "Serious crime (avg/yr)",
    "minor_crime_avg_yr": "Minor crime (avg/yr)",
@ -874,7 +866,7 @@ def _join_area_side_tables(
    election: pl.LazyFrame,
    poi_counts: pl.LazyFrame,
    noise: pl.LazyFrame,
-    school_proximity: pl.LazyFrame,
+    school_catchments: pl.LazyFrame,
    conservation_areas: pl.LazyFrame,
    tree_density: pl.LazyFrame | None,
    broadband: pl.LazyFrame,
@ -905,7 +897,7 @@ def _join_area_side_tables(
    base = base.join(election, on="pcon", how="left")
    base = base.join(poi_counts, on="postcode", how="left")
    base = base.join(noise, on="postcode", how="left")
-    base = base.join(school_proximity, on="postcode", how="left")
+    base = base.join(school_catchments, on="postcode", how="left")
    base = base.join(conservation_areas, on="postcode", how="left").with_columns(
        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
    )
@ -1970,7 +1962,7 @@ def _build(
    ethnicity_path: Path,
    crime_path: Path,
    noise_path: Path,
-    school_proximity_path: Path,
+    school_catchments_path: Path,
    broadband_path: Path,
    conservation_areas_path: Path,
    rental_prices_path: Path,
@ -2080,7 +2072,7 @@ def _build(
        )
        .select("postcode", "noise_lden_db")
    )
-    school_proximity = pl.scan_parquet(school_proximity_path)
+    school_catchments = pl.scan_parquet(school_catchments_path)
    conservation_areas = _conservation_area_by_postcode(
        arcgis.select("postcode", "lat", "lon"), conservation_areas_path
    )
@ -2120,7 +2112,7 @@ def _build(
        "election": election,
        "poi_counts": poi_counts,
        "noise": noise,
-        "school_proximity": school_proximity,
+        "school_catchments": school_catchments,
        "conservation_areas": conservation_areas,
        "tree_density": tree_density,
        "broadband": broadband,
@ -2267,10 +2259,10 @@ def main():
        "--noise", type=Path, required=True, help="Road noise by postcode parquet file"
    )
    parser.add_argument(
-        "--school-proximity",
+        "--school-catchments",
        type=Path,
        required=True,
-        help="School proximity counts parquet file",
+        help="School catchment counts parquet file",
    )
    parser.add_argument(
        "--broadband",
@ -2376,7 +2368,7 @@ def main():
        ethnicity_path=args.ethnicity,
        crime_path=args.crime,
        noise_path=args.noise,
-        school_proximity_path=args.school_proximity,
+        school_catchments_path=args.school_catchments,
        broadband_path=args.broadband,
        conservation_areas_path=args.conservation_areas,
        rental_prices_path=args.rental_prices,
--- a/pipeline/transform/poi_proximity.py
+++ b/pipeline/transform/poi_proximity.py
@ -25,11 +25,30 @@ POI_GROUPS_2KM = {
 # Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
 GROCERIES_GROUP = "Groceries"

+# Groceries categories EXCLUDED from the static "Number of grocery shops and
+# supermarkets within 2km" metric. Bakeries, butchers, delis and off-licences
+# are speciality food retail, not somewhere you do a grocery shop; together
+# they were ~a third of the group and inflated the headline count. The metric
+# keeps Supermarket, Convenience Store, Greengrocer and every GEOLYTIX brand.
+GROCERY_STATIC_EXCLUDED_CATEGORIES = {
+    "Bakery",
+    "Butcher & Fishmonger",
+    "Deli & Specialty",
+    "Off-Licence",
+}
+
 # OS Open Greenspace function types used for park counts and distance calculation.
 # Uses the authoritative OS dataset instead of OSM point POIs for better coverage
 # of green spaces that are only mapped as polygons in OSM.
+# Scope: "Public Park Or Garden" is the core park function. "Playing Field"
+# (open public recreation grounds) is borderline but kept: outside big cities
+# the local rec ground is the de facto park. "Play Space" (playgrounds) is
+# excluded — a playground is not a park, and "Playground" is already its own
+# OSM-derived category. The remaining functions (Religious Grounds, Golf
+# Course, Cemetery, Allotments, Bowling Green, Tennis Court, Other Sports
+# Facility) are clearly not parks.
 GREENSPACE_PARK_FUNCTIONS = {
-    "parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
+    "parks": ["Public Park Or Garden", "Playing Field"],
 }

 GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
@ -50,17 +69,22 @@ def _poi_category_slug(category: str) -> str:


 def _groceries_categories(pois: pl.DataFrame) -> list[str]:
-    """Return the distinct `category` values for the Groceries group.
+    """Return the distinct `category` values for the static groceries metric.

    `count_pois_per_postcode` matches POIs on `category`, but the authoritative
    GEOLYTIX grocery dataset stores the brand name there (e.g. "Tesco", "Aldi")
    with group "Groceries"; it never emits the literal "Supermarket". Collecting
    every Groceries category captures both the OSM strings and the brand names.
+    Speciality food retail (bakeries, butchers, delis, off-licences) is
+    excluded — see GROCERY_STATIC_EXCLUDED_CATEGORIES.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")
    return (
-        pois.filter(pl.col("group") == GROCERIES_GROUP)
+        pois.filter(
+            (pl.col("group") == GROCERIES_GROUP)
+            & ~pl.col("category").is_in(list(GROCERY_STATIC_EXCLUDED_CATEGORIES))
+        )
        .select("category")
        .unique()
        .sort("category")
@ -109,6 +133,40 @@ def _build_poi_category_groups(
    return groups, display_names


+def _greenspace_count_frame(greenspace: pl.DataFrame) -> pl.DataFrame:
+    """Reduce the greenspace frame to a single representative row per site.
+
+    Rows in os_greenspace.parquet are ACCESS POINTS (park gates). That grain
+    suits nearest-distance, where the closest gate is what matters, but it
+    badly inflates "Number of amenities (Park) within Xkm": a large park with
+    30 gates would count as 30 parks. For counting, each site contributes one
+    row placed at the site centroid, or at its first access point when no
+    centroid is available. Rows with a null `site_id` pass through as-is, and
+    a legacy parquet lacking `site_id` altogether is returned unchanged
+    (gate-grain counts, with a warning) instead of crashing.
+    """
+    if "site_id" not in greenspace.columns:
+        print(
+            "WARNING: greenspace parquet has no site_id column; park counts "
+            "will count access points, not sites (regenerate os_greenspace)"
+        )
+        return greenspace
+
+    has_site = pl.col("site_id").is_not_null()
+    per_site = greenspace.filter(has_site).unique(subset=["site_id"], keep="first")
+    without_site = greenspace.filter(~has_site)
+    if all(name in greenspace.columns for name in ("site_lat", "site_lng")):
+        centred = [
+            pl.coalesce([pl.col(f"site_{axis}"), pl.col(axis)]).alias(axis)
+            for axis in ("lat", "lng")
+        ]
+        per_site = per_site.with_columns(centred)
+
+    per_site = per_site.select(greenspace.columns)
+    parts = [per_site, without_site] if without_site.height > 0 else [per_site]
+    return pl.concat(parts)
+
+
 def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
    renames: dict[str, str] = {}
    for group_key, category in display_names.items():
@ -185,13 +243,16 @@ def main():

    # Park counts and distances from OS Open Greenspace. They use the dynamic
    # amenity metric names so filters read through the same side-table path as
-    # OSM-derived amenity metrics.
+    # OSM-derived amenity metrics. Distances use the access-point grain (the
+    # nearest park GATE is the right semantics); counts use one row per SITE so
+    # a park with many gates counts once.
    greenspace = pl.read_parquet(args.greenspace)
+    greenspace_sites = _greenspace_count_frame(greenspace)
    park_counts_2km = count_pois_per_postcode(
-        postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
+        postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
    )
    park_counts_5km = count_pois_per_postcode(
-        postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
+        postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
    )
    park_distances = min_distance_per_postcode(
        postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS
--- a/pipeline/transform/postcode_boundaries/main.py
+++ b/pipeline/transform/postcode_boundaries/main.py
@ -260,6 +260,12 @@ def main() -> None:
    )
    args = parser.parse_args()

+    if args.greenspace and not args.greenspace.exists():
+        # Fail loudly and EARLY (before the ~10h Phases 1-3): silently skipping
+        # the subtraction is exactly how parks/lakes shipped inside postcode
+        # boundaries unnoticed.
+        raise SystemExit(f"--greenspace file not found: {args.greenspace}")
+
    fragments_cache = args.output / "fragments_cache.parquet"
    # Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
    # so a greenspace change must not invalidate the fragment cache.
@ -294,7 +300,7 @@ def main() -> None:

    greenspace_tree = None
    greenspace_geoms = None
-    if args.greenspace and args.greenspace.exists():
+    if args.greenspace:
        from .greenspace import load_greenspace

        print(f"  Loading greenspace/water from {args.greenspace}...")
--- a/pipeline/transform/postcode_boundaries/greenspace.py
+++ b/pipeline/transform/postcode_boundaries/greenspace.py
@ -3,7 +3,7 @@
 from pathlib import Path

 import polars as pl
-from shapely import wkb
+from shapely import make_valid, wkb
 from shapely.geometry import MultiPolygon, Polygon
 from shapely.strtree import STRtree

@ -13,12 +13,23 @@ from .geometry import safe_difference, safe_union
 def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Load greenspace parquet and build an STRtree spatial index.

+    Geometries are repaired with ``make_valid`` on load: an invalid park/lake
+    polygon would make the per-postcode ``intersects`` predicate (and the exact
+    difference path) liable to raise mid-merge, hours into a build. Empty
+    geometries are dropped.
+
    Returns:
        (tree, geoms) where tree is a Shapely STRtree and geoms is
        the list of geometries indexed by the tree.
    """
    df = pl.read_parquet(path)
-    geoms = [wkb.loads(g) for g in df["geometry"].to_list()]
+    geoms = []
+    for raw in df["geometry"].to_list():
+        geom = wkb.loads(raw)
+        if not geom.is_valid:
+            geom = make_valid(geom)
+        if not geom.is_empty:
+            geoms.append(geom)
    tree = STRtree(geoms)
    return tree, geoms

--- a/pipeline/transform/postcode_boundaries/output.py
+++ b/pipeline/transform/postcode_boundaries/output.py
@ -101,6 +101,21 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
    return geojson_dict


+def _is_pointlike(geom_bng) -> bool:
+    """True when a BNG geometry has collapsed to a point (tower-block signature).
+
+    Requires BOTH a near-zero area and a short perimeter: a genuine thin
+    sliver also has negligible area but still carries length, so it does not
+    qualify. If measuring the geometry raises GEOSException, returns False.
+    """
+    try:
+        if not geom_bng.area < _POINTLIKE_AREA_M2:
+            return False
+        return geom_bng.length < _POINTLIKE_PERIMETER_M
+    except GEOSException:
+        return False
+
+
 def _rescue_footprint(geom_bng) -> dict | None:
    """Fatten a degenerate BNG geometry into a representable footprint and snap.

@ -109,15 +124,9 @@ def _rescue_footprint(geom_bng) -> dict | None:
    gets a building-scale buffer so it is not reduced to an invisible sub-metre
    dot; thin slivers that still carry length keep the minimal buffer.
    """
-    buffer_m = _MIN_FOOTPRINT_BUFFER_M
-    try:
-        if (
-            geom_bng.area < _POINTLIKE_AREA_M2
-            and geom_bng.length < _POINTLIKE_PERIMETER_M
-        ):
-            buffer_m = _POINT_RESCUE_BUFFER_M
-    except GEOSException:
-        pass
+    buffer_m = (
+        _POINT_RESCUE_BUFFER_M if _is_pointlike(geom_bng) else _MIN_FOOTPRINT_BUFFER_M
+    )
    footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
    if footprint is None:
        return None
@ -147,6 +156,12 @@ def to_wgs84_geojson(
        )
        if simplified is None:
            simplified = cleaned
+        if _is_pointlike(simplified):
+            # A POINTLIKE footprint is rescued to building scale even when it
+            # would survive snapping: a 0.1-1 m² polygon serializes fine but
+            # ships as an invisible dot covering a whole tower block.
+            result = _rescue_footprint(simplified)
+        else:
            # Normal path; if snapping erases a thin sliver, fatten its real shape.
            result = _snap_to_wgs84_geojson(simplified)
            if result is None:
@ -229,6 +244,10 @@ def merge_fragments(
        greenspace_tree: Optional STRtree of park/water polygons.
        greenspace_geoms: Optional list of park/water geometries (indexed by tree).
    """
+    subtract = greenspace_tree is not None and greenspace_geoms is not None
+    if subtract:
+        from .greenspace import subtract_greenspace
+
    by_postcode: dict[str, list] = defaultdict(list)
    for pc, geom in all_fragments:
        by_postcode[pc].append(geom)
@ -256,9 +275,7 @@ def merge_fragments(
        # Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
        combined = _fill_holes(combined)
        # Subtract parks/water if provided
-        if greenspace_tree is not None and greenspace_geoms is not None:
-            from .greenspace import subtract_greenspace
-
+        if subtract:
            pre_green = combined
            combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
            combined = _keep_polygon_parts(combined)
--- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
+++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
@ -921,6 +921,49 @@ class TestToWgs84Geojson:
        area_m2 = transform_geometry(to_bng.transform, shape(result)).area
        assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"

+    def test_snappable_pointlike_polygon_still_gets_building_scale_footprint(self):
+        """A collapsed footprint that happens to survive precision snapping (e.g.
+        EC2A 2FJ: 181 properties on 0.86 m²) must still be rescued: every pointlike
+        input becomes a ~201 m² disc, whether or not snapping would have kept it."""
+        import pyproj
+        from shapely.geometry import shape
+        from shapely.ops import transform as transform_geometry
+
+        from .output import _snap_to_wgs84_geojson
+
+        # Pointlike yet snappable: a 0.9 m square has area 0.81 m² and perimeter
+        # 3.6 m, and spans ~8 cells of the 1e-6 degree output grid.
+        speck = box(530000, 180000, 530000.9, 180000.9)
+        snapped_directly = _snap_to_wgs84_geojson(speck)
+        assert snapped_directly is not None, (
+            "precondition: this polygon must be snappable, otherwise the test "
+            "exercises the old snap-fails path instead of the new one"
+        )
+
+        geojson = to_wgs84_geojson(speck)
+        assert geojson is not None
+        to_bng = pyproj.Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
+        area_m2 = transform_geometry(to_bng.transform, shape(geojson)).area
+        assert 150 < area_m2 < 300, (
+            f"pointlike snappable footprint shipped at {area_m2:.2f} m^2 "
+            "instead of a building-scale (~201 m^2) disc"
+        )
+
+    def test_normal_polygon_area_unchanged(self):
+        """An ordinary polygon keeps its area: no rescue buffer is applied."""
+        import pyproj
+        from shapely.geometry import shape
+        from shapely.ops import transform as transform_geometry
+
+        square = box(530000, 180000, 530100, 180100)  # 100 m x 100 m = 10,000 m²
+        geojson = to_wgs84_geojson(square)
+        assert geojson is not None
+        wgs84_to_bng = pyproj.Transformer.from_crs(
+            "EPSG:4326", "EPSG:27700", always_xy=True
+        ).transform
+        area_m2 = transform_geometry(wgs84_to_bng, shape(geojson)).area
+        assert area_m2 == pytest.approx(10_000, rel=0.01)
+
    def test_thin_sliver_keeps_minimal_buffer(self):
        """A genuine elongated sliver still carries length, so it is NOT inflated
        to building scale — only truly pointlike inputs are."""
@ -1132,6 +1175,26 @@ class TestSubtractGreenspace:
        # 80% < 90% cap, so subtraction should happen
        assert result.area == pytest.approx(2000, rel=0.01)

+    def test_load_greenspace_repairs_invalid_and_drops_empty(self, tmp_path):
+        """load_greenspace must repair an invalid (bow-tie) park polygon, which would
+        otherwise make the per-postcode intersects/difference liable to raise hours
+        into a merge, and must drop empty geometries instead of indexing them."""
+        from .greenspace import load_greenspace
+
+        bowtie = Polygon([(0, 0), (10, 10), (10, 0), (0, 10)])  # self-intersects
+        assert not bowtie.is_valid
+        rows = [bowtie.wkb, box(20, 20, 30, 30).wkb, Polygon().wkb]  # last is EMPTY
+        path = tmp_path / "greenspace.parquet"
+        pl.DataFrame({"geometry": rows}).write_parquet(path)
+
+        tree, geoms = load_greenspace(path)
+        assert len(geoms) == 2  # bow-tie + valid box; the empty row is gone
+        assert all(g.is_valid and not g.is_empty for g in geoms)
+        # The repaired bow-tie must still subtract cleanly.
+        result = subtract_greenspace(box(0, 0, 100, 100), tree, geoms)
+        assert result.is_valid
+        assert result.area < 10_000
+

 class TestToWgs84GeojsonValidity:
    """to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@ -26,6 +26,7 @@ from pipeline.transform.price_estimation.shrinkage import (
 from pipeline.transform.price_estimation.utils import (
    CURRENT_YEAR,
    LATEST_COMPLETE_YEAR,
+    SMOOTHNESS_SUPPORT_PAIRS,
    TEMPORAL_SMOOTHNESS_LAMBDA,
    TYPE_GROUPS,
    build_hedonic_features,
@ -37,6 +38,19 @@ from pipeline.transform.price_estimation.utils import (

 MIN_PAIRS = 5
 OUTLIER_THRESHOLD = 3.0  # hard pre-filter; Huber handles the rest
+# Gap-aware companion to OUTLIER_THRESHOLD: |log_ratio| must also stay within
+# this many log-units PER YEAR of holding period (short gaps are allowed a
+# full year's band). A flat +/-3.0 cap admits e.g. a 10k -> 196k "sale" six
+# months apart (log +2.95, and weight 1/sqrt(gap) gives it the leverage of
+# ~10 normal pairs); Huber does NOT recover, because once the thin year's
+# beta satisfies the garbage pair it is the many good long-gap pairs that
+# carry the residual and get down-weighted. Such pairs are data errors or
+# non-market transfers (right-to-buy, probate, flips), not house-price
+# signal -- standard repeat-sales practice (Case-Shiller) excludes extreme
+# annualised returns for the same reason. 0.7 log/yr (~2x in a year) keeps
+# any plausible genuine market move; long-gap pairs are still governed by
+# the +/-3.0 cap.
+ANNUALISED_OUTLIER_THRESHOLD = 0.7
 HUBER_K = 1.345
 IRLS_ITERATIONS = 5

@ -111,7 +125,16 @@ def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFram
                / (pl.col("frac_year2") - pl.col("frac_year1")).cast(pl.Float64).sqrt()
            ).alias("weight"),
        )
-        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
+        .filter(
+            pl.col("log_ratio").abs()
+            <= pl.min_horizontal(
+                pl.lit(OUTLIER_THRESHOLD),
+                ANNUALISED_OUTLIER_THRESHOLD
+                * pl.max_horizontal(
+                    pl.col("frac_year2") - pl.col("frac_year1"), pl.lit(1.0)
+                ),
+            )
+        )
        .collect()
    )

@ -181,11 +204,27 @@ def solve_robust_index(
    # beta=0) has no column, so the penalty spans the non-baseline years only.
    # For cells with <3 betas there is no curvature to penalise and the solve is
    # unchanged.
+    #
+    # The penalty is SUPPORT-SCALED per row: a flat lambda is too weak for
+    # years identified by only 1-2 repeat-sale pairs (a cell can have hundreds
+    # of pairs overall yet single thin years, yielding 2-7x one-year spikes
+    # that cell-level shrinkage cannot catch). Each curvature row's lambda is
+    # lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / s), with s the minimum
+    # cross-year pair count among the row's three years, so thin years are
+    # pulled strongly toward the local trend while well-supported years keep
+    # the baseline penalty. Taking the min over the triple (not just the
+    # middle year) also covers thin FIRST/LAST years of the range, which only
+    # ever appear at a triple's edge -- the last solved year feeds the
+    # CURRENT_YEAR trend extrapolation, so spikes there are the costliest.
    n_pen = 0
    pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
    pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
    if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
-        sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
+        cross = years1 != years2
+        touched, counts = np.unique(
+            np.concatenate([years1[cross], years2[cross]]), return_counts=True
+        )
+        support = {int(y): int(c) for y, c in zip(touched, counts)}
        years_sorted = sorted(year_to_col)
        cols_by_year = [year_to_col[y] for y in years_sorted]
        n_pen = n_cols - 2
@ -202,6 +241,11 @@ def solve_robust_index(
            w0 = 2.0 / ((y1 - y0) * (y2 - y0))
            w1 = -2.0 / ((y1 - y0) * (y2 - y1))
            w2 = 2.0 / ((y2 - y1) * (y2 - y0))
+            s_k = min(support.get(y, 0) for y in (y0, y1, y2))
+            lam_k = TEMPORAL_SMOOTHNESS_LAMBDA * (
+                1.0 + SMOOTHNESS_SUPPORT_PAIRS / max(s_k, 1)
+            )
+            sqrt_lambda = float(np.sqrt(lam_k))
            pen_vals[3 * k : 3 * k + 3] = (
                sqrt_lambda * w0,
                sqrt_lambda * w1,
@ -347,10 +391,22 @@ def compute_hedonic_index(


 EXTRAPOLATION_YEARS = 3
+# Bound on the per-year slope used to trend-extrapolate beyond the last solved
+# year (the solve stops at LATEST_COMPLETE_YEAR; CURRENT_YEAR is filled here).
+# +/-0.10 log/yr (~+/-10.5%/yr) comfortably covers genuine UK sector-level
+# annual moves while preventing a residual spike in the recent betas from
+# compounding into an absurd extrapolated step (e.g. +49% in one year).
+MAX_EXTRAPOLATION_SLOPE = 0.10


 def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
-    """Forward-fill missing years, with linear extrapolation beyond last known year."""
+    """Forward-fill missing years, with trend extrapolation beyond last known year.
+
+    The extrapolation slope is the MEDIAN of the per-year slopes between
+    consecutive known points in the recent window (a single noisy year corrupts
+    at most one of those slopes, unlike a least-squares fit through all the
+    points), clamped to +/-MAX_EXTRAPOLATION_SLOPE.
+    """
    if not index:
        return {y: 0.0 for y in range(min_year, max_year + 1)}

@ -365,7 +421,7 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
            last = index[y]
        filled[y] = last

-    # Linear extrapolation beyond last known year
+    # Robust trend extrapolation beyond last known year
    if last_known_year < max_year:
        recent = [
            (y, index[y])
@ -373,9 +429,17 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
            if y >= last_known_year - EXTRAPOLATION_YEARS
        ]
        if len(recent) >= 2:
-            years_arr = np.array([r[0] for r in recent], dtype=np.float64)
-            vals_arr = np.array([r[1] for r in recent], dtype=np.float64)
-            slope = np.polyfit(years_arr, vals_arr, 1)[0]
+            slopes = [
+                (v_b - v_a) / (y_b - y_a)
+                for (y_a, v_a), (y_b, v_b) in zip(recent[:-1], recent[1:])
+            ]
+            slope = float(
+                np.clip(
+                    np.median(slopes),
+                    -MAX_EXTRAPOLATION_SLOPE,
+                    MAX_EXTRAPOLATION_SLOPE,
+                )
+            )
            for y in range(last_known_year + 1, max_year + 1):
                filled[y] = index[last_known_year] + slope * (y - last_known_year)
        else:
@ -389,12 +453,16 @@ def build_index(
    input_path: Path,
    max_pair_year: int | None = None,
    postcodes_path: Path | None = None,
+    sectors: list[str] | None = None,
 ) -> pl.DataFrame:
    """Build the full price index from raw data.

    If max_pair_year is set, only pairs before that year are used (backtesting holdout).
    The index is still forward-filled to CURRENT_YEAR.
    postcodes_path: if provided, lat/lon are read from this file instead of input_path.
+    sectors: if provided, restrict the build to these postcode sectors (for
+    debugging/verification runs; hierarchy levels are then computed only from
+    the scoped pairs, so scoped output is NOT identical to a full build).
    """
    # Solve the index only on COMPLETE calendar years: exclude the partial
    # current year, whose thin repeat-sale set yields wild betas. The index is
@ -405,6 +473,9 @@ def build_index(
        max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
    )
    pairs = extract_pairs(input_path, max_year2=estimation_cap)
+    if sectors is not None:
+        pairs = pairs.filter(pl.col("sector").is_in(sectors))
+        print(f"  Scoped to {len(sectors)} sectors: {len(pairs):,} pairs")
    centroids = extract_centroids(postcodes_path or input_path)

    min_year = int(pairs["year1"].min())
@ -534,9 +605,21 @@ def main():
        help="Path to postcode.parquet (for lat/lon centroids)",
    )
    parser.add_argument("--output", type=Path, required=True)
+    parser.add_argument(
+        "--sectors",
+        type=str,
+        default=None,
+        help="Comma-separated postcode sectors to scope the build to "
+        "(debug/verification only; hierarchy is computed from scoped pairs)",
+    )
    args = parser.parse_args()

-    result = build_index(args.input, postcodes_path=args.postcodes)
+    sectors = (
+        [s.strip() for s in args.sectors.split(",") if s.strip()]
+        if args.sectors
+        else None
+    )
+    result = build_index(args.input, postcodes_path=args.postcodes, sectors=sectors)

    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
--- a/pipeline/transform/price_estimation/test_index.py
+++ b/pipeline/transform/price_estimation/test_index.py
@ -3,7 +3,10 @@ import polars as pl

 from pipeline.transform.price_estimation import index as index_mod
 from pipeline.transform.price_estimation.index import (
+    MAX_EXTRAPOLATION_SLOPE,
    compute_indices_for_level,
+    extract_pairs,
+    forward_fill,
    solve_robust_index,
 )

@ -105,6 +108,139 @@ def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
    assert abs(idx[2015] - true[2015]) < 0.05


+def _ramp_pairs_with_thin_tail(tail_ratio: float, tail_n: int, ramp_reps: int):
+    """Synthetic repeat-sale pairs: a steady ramp followed by a sparse last year.
+
+    Each adjacent year pair from 2010->2011 up to 2019->2020 is repeated
+    `ramp_reps` times with the log ratio implied by a 0.04/yr ramp; `tail_n`
+    additional 2020->2021 pairs each carry `tail_ratio`. Returns the four
+    arrays (y1, y2, lr, w), where every entry of w is 1.0.
+    """
+    level = {yr: 0.04 * (yr - 2010) for yr in range(2010, 2021)}
+    ramp_rows = [
+        (start, start + 1, level[start + 1] - level[start])
+        for start in range(2010, 2020)
+        for _ in range(ramp_reps)
+    ]
+    tail_rows = [(2020, 2021, tail_ratio) for _ in range(tail_n)]
+    rows = ramp_rows + tail_rows
+    return (
+        np.array([r[0] for r in rows], dtype=np.int32),
+        np.array([r[1] for r in rows], dtype=np.int32),
+        np.array([r[2] for r in rows], dtype=np.float64),
+        np.ones(len(rows), dtype=np.float64),
+    )
+
+
+def test_support_scaled_penalty_suppresses_thin_year_spike(monkeypatch):
+    """One lone pair claiming a +1.5 log jump into the final year must be
+    damped by the support-scaled penalty, whereas the flat baseline penalty
+    (SMOOTHNESS_SUPPORT_PAIRS patched to 0) lets the jump through nearly
+    intact. The thin year is the LAST year of the range, which only ever sits
+    at the edge of a curvature triple, so this also exercises the
+    min-over-triple support rule at the range boundary."""
+    pairs = _ramp_pairs_with_thin_tail(tail_ratio=1.5, tail_n=1, ramp_reps=10)
+
+    # Baseline solve with support scaling switched off, then restore it.
+    monkeypatch.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
+    flat = solve_robust_index(*pairs)
+    monkeypatch.undo()
+    scaled = solve_robust_index(*pairs)
+
+    # Flat lambda barely resists the spike; the scaled lambda suppresses it.
+    assert flat[2021] - flat[2020] > 1.2
+    assert scaled[2021] - scaled[2020] < 0.65
+    # The well-supported ramp years must stay within 0.1 of the true ramp:
+    # the strong penalty row spanning the thin year may drag its immediate
+    # neighbour slightly toward collinearity, but no further than that.
+    for year in range(2010, 2021):
+        assert abs(scaled[year] - 0.04 * (year - 2010)) < 0.1
+
+
+def test_support_scaling_leaves_well_supported_years_unchanged(monkeypatch):
+    """When every year is backed by plenty of pairs (50 copies of each
+    adjacent pair, tail included), the support-scaled solve must agree with
+    the flat-penalty solve to within 1e-3 on every year."""
+    pairs = _ramp_pairs_with_thin_tail(tail_ratio=0.04, tail_n=50, ramp_reps=50)
+
+    # Flat-penalty reference first, then the default (scaled) behaviour.
+    monkeypatch.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
+    flat = solve_robust_index(*pairs)
+    monkeypatch.undo()
+    scaled = solve_robust_index(*pairs)
+
+    assert set(flat) == set(scaled)
+    worst_gap = max(abs(flat[y] - scaled[y]) for y in flat)
+    assert worst_gap < 1e-3
+
+
+def test_forward_fill_extrapolation_uses_robust_median_slope():
+    """A spike confined to ONE recent year must not leak into the extrapolated
+    step: the median of the consecutive per-year slopes discards it."""
+    spiky = {2022: 1.00, 2023: 1.05, 2024: 1.60, 2025: 1.10}
+    # Consecutive slopes are [+0.05, +0.55, -0.50], whose median is +0.05.
+    expected_2026 = 1.10 + 0.05
+    filled = forward_fill(spiky, 2022, 2026)
+    assert abs(filled[2026] - expected_2026) < 1e-9
+
+
+def test_forward_fill_extrapolated_slope_is_clamped():
+    """A steady 0.4/yr trend, rising or falling, is extrapolated with a slope
+    limited to +/-MAX_EXTRAPOLATION_SLOPE."""
+    cases = (
+        # (known index, expected value one year past the last known year)
+        ({2022: 0.0, 2023: 0.4, 2024: 0.8, 2025: 1.2}, 1.2 + MAX_EXTRAPOLATION_SLOPE),
+        ({2022: 1.2, 2023: 0.8, 2024: 0.4, 2025: 0.0}, 0.0 - MAX_EXTRAPOLATION_SLOPE),
+    )
+    for known, expected_2026 in cases:
+        filled = forward_fill(known, 2022, 2026)
+        assert abs(filled[2026] - expected_2026) < 1e-9
+
+
+def test_forward_fill_preserves_sane_trend_and_flat_fallback():
+    """A moderate, steady trend is still carried forward past the last known
+    year; with fewer than two recent points the fill is flat instead."""
+    steady = {2022: 1.00, 2023: 1.05, 2024: 1.10, 2025: 1.15}
+    trended = forward_fill(steady, 2022, 2026)
+    assert abs(trended[2026] - 1.20) < 1e-9
+
+    # A single known point gives no slope, so its value is simply repeated.
+    lone_point = forward_fill({2025: 0.7}, 2024, 2026)
+    assert lone_point[2026] == 0.7
+
+
+def test_extract_pairs_drops_extreme_annualised_returns(tmp_path):
+    """An implausibly fast gain (10x in six months) must be rejected by the
+    annualised cap, while a large ratio spread over a long holding period and
+    a moderate one-year gain are both retained."""
+
+    def sale(year: int, month: int, price: int) -> dict:
+        return {"year": year, "month": month, "price": price}
+
+    histories = [
+        # +2.30 log in 6 months -> dropped (cap 0.7 for gap <= 1yr)
+        [sale(2020, 1, 100_000), sale(2020, 7, 1_000_000)],
+        # +2.20 log over 24 years -> kept (flat 3.0 cap governs)
+        [sale(2000, 1, 100_000), sale(2024, 1, 900_000)],
+        # +0.41 log in 1 year -> kept (within the 0.7/yr band)
+        [sale(2020, 1, 100_000), sale(2021, 1, 150_000)],
+    ]
+    frame = pl.DataFrame(
+        {
+            "Postcode": ["AB1 2CD", "AB1 2CE", "AB1 2CF"],
+            "Property type": ["Detached"] * 3,
+            "historical_prices": histories,
+        }
+    )
+    parquet_path = tmp_path / "props.parquet"
+    frame.write_parquet(parquet_path)
+
+    pairs = extract_pairs(parquet_path)
+
+    assert len(pairs) == 2
+    kept_ratios = sorted(round(r, 2) for r in pairs["log_ratio"].to_list())
+    assert kept_ratios == [0.41, 2.2]
+
+
 def test_n_pairs_counts_only_cross_year_pairs():
    """FIX #12: same-year pairs carry zero index information and must not inflate
    the shrinkage weight; n_pairs counts only cross-year (year2 != year1) pairs."""
--- a/pipeline/transform/price_estimation/utils.py
+++ b/pipeline/transform/price_estimation/utils.py
@ -36,6 +36,20 @@ SHRINKAGE_K = 50
 # noisy year) without flattening genuine multi-year trends.
 TEMPORAL_SMOOTHNESS_LAMBDA = 0.05

+# Per-year support scaling for the temporal smoothness penalty. A flat lambda
+# is too weak for years with very few repeat-sale pairs: a sector can have
+# hundreds of pairs overall (so cell-level n/(n+k) shrinkage barely moves it)
+# yet have individual years estimated from 1-2 pairs, producing 2-7x
+# single-year index spikes. Each curvature row is therefore scaled by the
+# local pair support of its year triple:
+#   lambda_eff = lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / s)
+# where s is the minimum cross-year pair count among the triple's years.
+# Well-supported years (s >> SMOOTHNESS_SUPPORT_PAIRS) keep lambda_eff ~
+# lambda0 (current behaviour); a year identified by a single pair gets
+# ~41x lambda0, pulling its beta strongly toward the local trend through its
+# neighbours. Same-year pairs cancel in the design and are not counted.
+SMOOTHNESS_SUPPORT_PAIRS = 40
+

 def type_group_expr():
    """Polars expression: Property type -> type_group."""
--- a/pipeline/transform/school_catchments.py
+++ b/pipeline/transform/school_catchments.py
@ -0,0 +1,748 @@
+"""Model historical school catchment areas and count them per postcode.
+
+No national dataset of school catchment areas exists for England: catchments
+are set per admission authority, only a handful of councils publish polygons,
+and the pupil-residence data behind commercial "heatmap" catchments lives in
+the restricted National Pupil Database. This module therefore COMPILES one
+from open data, estimating each school's admission cutoff distance ("last
+distance offered") — the radius within which an applicant would plausibly be
+offered a place.
+
+Model: English state admissions are run as deferred acceptance with distance
+tie-breaks, which in a continuum economy is equivalent to finding
+market-clearing cutoff distances (Azevedo & Leshno 2016). Per phase
+(primary/secondary):
+
+1. Demand — Census 2021 children per LSOA (TS007A age bands, prorated to the
+   phase's cohort ages) split evenly across the LSOA's live postcodes.
+2. Supply — every open, non-selective state-funded school (GIAS), with a fill
+   target of max(capacity, headcount) prorated to the phase's cohorts
+   (sixth-form and nursery years carry reduced weight, since their class
+   sizes differ and they are not allocated by the same admissions round).
+3. Preferences — children prefer nearby schools, trading distance against
+   Ofsted grade: a school's effective distance is its real distance minus a
+   grade bonus (Outstanding > Good > ungraded > below-Good). Because real
+   first preferences are heterogeneous, each postcode's children split
+   across nearby feasible schools with logit weights over effective
+   distance rather than all picking the same one.
+4. Equilibrium — cutoffs start unbounded and tighten monotonically: each
+   round, children apply to their preferred feasible school(s), and
+   oversubscribed schools tighten their cutoff to the distance of their
+   marginal admitted child. Converges to the deferred-acceptance outcome.
+5. Schools that never fill have no binding cutoff — anyone who applies gets
+   in — so their feasibility radius is the distance within which the local
+   child population would cover their fill target, capped.
+
+The free parameters (preference bonuses, demand scale, choice temperature,
+residual calibration factors) are CALIBRATED against published "last
+distance offered" figures scraped from nine local authorities' allocation
+reports — see check_school_cutoffs.py and the constants below.
+
+A postcode is "inside the catchment" of every school whose cutoff radius
+covers it. The output counts those schools per postcode for the four
+good+/outstanding x primary/secondary categories (Ofsted-classified, same
+rules as the previous proximity metric). Selective (grammar) schools are
+excluded throughout: their intakes are test-based and region-wide, so a
+distance model would fabricate a catchment that does not exist.
+
+Known limitations: faith oversubscription criteria are not modelled (whether
+a faith school's catchment is open to a given family depends on the family),
+and Census 2021 child counts lag current rolls slightly. Cutoffs are
+straight-line distances, the modal LA tie-break criterion.
+"""
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+import polars as pl
+from scipy.spatial import cKDTree
+
+from pipeline.utils.poi_counts import _project_lat_lng_km, valid_uk_coords_mask
+
+# Group name -> the `category` labels (the strings emitted by
+# classify_good_plus_schools) that belong to the group. The good_* groups are
+# "good or better", so they list the outstanding label as well.
+# NOTE(review): presumably these keys name the per-postcode count columns;
+# the consumer is outside this view -- confirm.
+SCHOOL_GROUPS = {
+    "good_primary": ["good_primary", "outstanding_primary"],
+    "good_secondary": ["good_secondary", "outstanding_secondary"],
+    "outstanding_primary": ["outstanding_primary"],
+    "outstanding_secondary": ["outstanding_secondary"],
+}
+
+# Age thresholds for deciding which phase(s) a school serves. A school serves
+# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
+# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
+# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
+# both the primary and the secondary metrics — Ofsted's coarse "Ofsted phase"
+# labels such schools as just "Secondary", which previously hid them from every
+# postcode's primary-school count.
+PRIMARY_MAX_AGE = 10
+SECONDARY_MIN_AGE = 12
+
+# Cohort ages (inclusive) each phase competes for: Reception-Y6 and Y7-Y11.
+PRIMARY_AGES = (4, 10)
+SECONDARY_AGES = (11, 15)
+
+# Cohort weights for prorating a school's headcount/capacity across the ages
+# it teaches. Nursery classes are typically part-time and small; sixth forms
+# run at roughly 60% of a school's Y7-Y11 cohort size. A flat proration
+# undersupplied secondary places by ~8%.
+NURSERY_COHORT_WEIGHT = 0.5  # ages < 4
+SIXTH_FORM_COHORT_WEIGHT = 0.6  # ages >= 16
+
+# Only schools that admit (mostly) by geography take part in the assignment.
+# Independent, special and Welsh schools and post-16 colleges either don't
+# admit by distance or fall outside the England postcode universe; selective
+# (grammar) schools admit by test from a wide region.
+STATE_SCHOOL_TYPE_GROUPS = [
+    "Academies",
+    "Local authority maintained schools",
+    "Free Schools",
+]
+
+# Preference bonuses (km of extra travel a family accepts for a better
+# school), applied as a discount on effective distance when children choose.
+# Grade 3/4 schools repel by the same magnitudes.
+PREF_BONUS_OUTSTANDING_KM = 0.6
+PREF_BONUS_GOOD_KM = 0.3
+
+# Share of resident children who actually compete for state places. Census
+# 2021 counts overstate current entry cohorts (birth rates fell ~10% between
+# 2016 and 2021, which is exactly the gap between the census stock and the
+# children reaching Reception by mid-decade) and independent/home-educated
+# children (~7%) never enter the allocation at all. Without this, modelled
+# cutoffs run systematically tight and undersubscribed schools look full.
+DEMAND_SCALE = 0.8
+
+# Logit choice temperature (km). With deterministic choice every child at a
+# postcode ranks the same school first, so popular schools fill entirely from
+# their nearest band and the marginal admitted child sits unrealistically
+# close. Real first preferences are heterogeneous; a school draws only a
+# distance-decaying share of nearby families. Children therefore split across
+# nearby feasible schools with weights softmax(-effective_distance / tau):
+# higher tau = more smearing = wider cutoffs. tau -> 0 recovers the
+# deterministic model (used by the unit tests). Calibrated 2026-06 against
+# 240 published binding cutoffs from 9 LAs (check_school_cutoffs.py): 0.3 km
+# maximises rank correlation and within-2x share; beyond ~0.6 the smearing
+# erases school-to-school differentiation (Spearman 0.24 -> 0.01).
+CHOICE_TEMPERATURE_KM = 0.3
+
+# Residual calibration from the same ground truth: after the equilibrium
+# solve, modelled cutoffs still ran systematically tight (median log2 bias
+# -0.53 primary / -0.36 secondary at the settings above — published "last
+# distance offered" reflects offer-day frictions, waiting-list churn and
+# furthest-applicant noise that no clean equilibrium reproduces). Radii are
+# multiplied by 2^-bias so the modelled median matches the published median;
+# rank ordering is unaffected.
+CUTOFF_CALIBRATION_FACTOR = {"primary": 1.44, "secondary": 1.28}
+
+# Each demand postcode considers this many nearest schools; beyond ~16
+# candidates assignment shares are negligible.
+NEAREST_SCHOOL_CANDIDATES = 16
+
+# Radius guard rails: the floor absorbs postcode-centroid noise around tiny
+# urban catchments; the cap bounds feasibility radii for schools the model
+# never fills (mostly rural).
+MIN_RADIUS_KM = 0.3
+MAX_RADIUS_KM = 25.0
+
+# Default upper bound on the number of cutoff-tightening rounds run by
+# equilibrium_cutoffs (its `max_iter` parameter).
+EQUILIBRIUM_MAX_ITER = 100
+
+
+def classify_good_plus_schools(
+    ofsted: pl.DataFrame, open_urns: set[int] | None = None
+) -> pl.DataFrame:
+    """Return ``(urn, category)`` rows for good and outstanding schools.
+
+    The grade is taken from ``_with_derived_grade``; only grades "1"
+    (outstanding) and "2" (good) at schools whose "Ofsted phase" is Primary
+    or Secondary are kept. When ``open_urns`` is supplied (and a URN column
+    exists), schools whose URN is not in that set are discarded, so closed or
+    merged schools with stale inspection rows are not counted.
+
+    Phase membership prefers the statutory age range: a lowest age <=
+    PRIMARY_MAX_AGE means the school serves primary, a highest age >=
+    SECONDARY_MIN_AGE means it serves secondary, so all-through and middle
+    schools can satisfy both and yield two rows. A null age, or a frame
+    lacking the two statutory-age columns, falls back to the "Ofsted phase"
+    label. The category is ``outstanding_<phase>`` for grade "1" and
+    ``good_<phase>`` otherwise.
+    """
+    ofsted_phase = pl.col("Ofsted phase")
+    grade = pl.col("_ofsted_grade")
+
+    rated = _with_derived_grade(ofsted).filter(
+        ofsted_phase.is_in(["Primary", "Secondary"]) & grade.is_in(["1", "2"])
+    )
+
+    # Restrict to the open register when one is given.
+    if open_urns is not None and "URN" in rated.columns:
+        rated = rated.filter(pl.col("URN").is_in(list(open_urns)))
+
+    # Label-based phase tests: used outright when the age columns are
+    # missing, and row by row when an individual age is null.
+    labelled_primary = ofsted_phase == "Primary"
+    labelled_secondary = ofsted_phase == "Secondary"
+    age_columns = {"Statutory lowest age", "Statutory highest age"}
+    if age_columns.issubset(rated.columns):
+        lowest = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
+        highest = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
+        serves_primary = (
+            pl.when(lowest.is_not_null())
+            .then(lowest <= PRIMARY_MAX_AGE)
+            .otherwise(labelled_primary)
+        )
+        serves_secondary = (
+            pl.when(highest.is_not_null())
+            .then(highest >= SECONDARY_MIN_AGE)
+            .otherwise(labelled_secondary)
+        )
+    else:
+        serves_primary = labelled_primary
+        serves_secondary = labelled_secondary
+
+    rated = rated.with_columns(
+        serves_primary.alias("_serves_primary"),
+        serves_secondary.alias("_serves_secondary"),
+    )
+
+    def categorised(flag: str, outstanding: str, good: str) -> pl.DataFrame:
+        """Rows where `flag` is true, labelled by grade ("1" -> outstanding)."""
+        return rated.filter(pl.col(flag)).with_columns(
+            pl.when(grade == "1")
+            .then(pl.lit(outstanding))
+            .otherwise(pl.lit(good))
+            .alias("category")
+        )
+
+    # Primary rows first, then secondary; a school may appear in both.
+    return pl.concat(
+        [
+            categorised("_serves_primary", "outstanding_primary", "good_primary"),
+            categorised(
+                "_serves_secondary", "outstanding_secondary", "good_secondary"
+            ),
+        ]
+    ).select(
+        pl.col("URN").cast(pl.Int64).alias("urn"),
+        "category",
+    )
+
+
+def _with_derived_grade(ofsted: pl.DataFrame) -> pl.DataFrame:
+    """Add an ``_ofsted_grade`` column holding "1"-"4" or null.
+
+    A graded OEIF result of "1".."4" is used as-is. Only when that result is
+    null or "Not judged" does the ungraded outcome apply: one starting with
+    "School remains Outstanding" gives "1" and one starting with "School
+    remains Good" gives "2", unless the outcome text contains "(Concerns)".
+    Every other row gets null.
+    """
+    # Both columns are cast to Utf8 so the string predicates stay valid even
+    # if a column is entirely null (read back as a Null dtype).
+    graded_result = pl.col("Latest OEIF overall effectiveness").cast(
+        pl.Utf8, strict=False
+    )
+    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
+        pl.Utf8, strict=False
+    )
+
+    lacks_grade = graded_result.is_null() | (graded_result == "Not judged")
+    no_concerns = ~ungraded_outcome.str.contains(r"\(Concerns\)")
+    fallback_outstanding = (
+        lacks_grade
+        & ungraded_outcome.str.starts_with("School remains Outstanding")
+        & no_concerns
+    )
+    fallback_good = (
+        lacks_grade
+        & ungraded_outcome.str.starts_with("School remains Good")
+        & no_concerns
+    )
+
+    derived = (
+        pl.when(graded_result.is_in(["1", "2", "3", "4"]))
+        .then(graded_result)
+        .when(fallback_outstanding)
+        .then(pl.lit("1"))
+        .when(fallback_good)
+        .then(pl.lit("2"))
+        .otherwise(None)
+    )
+    return ofsted.with_columns(derived.alias("_ofsted_grade"))
+
+
+def school_preference_bonuses(
+    ofsted: pl.DataFrame,
+    bonus_outstanding_km: float = PREF_BONUS_OUTSTANDING_KM,
+    bonus_good_km: float = PREF_BONUS_GOOD_KM,
+) -> pl.DataFrame:
+    """Map each school's derived Ofsted grade to a preference bonus in km.
+
+    Grades "1" and "2" receive the positive outstanding/good bonuses; grades
+    "3" and "4" receive the same magnitudes negated. Any grade outside that
+    mapping falls back to the 0.0 default (the intent being that ungraded
+    schools are neutral). Rows with a null URN are dropped and the result has
+    one ``(urn, bonus_km)`` row per URN.
+    """
+    grade_to_bonus = {
+        "1": bonus_outstanding_km,
+        "2": bonus_good_km,
+        "3": -bonus_good_km,
+        "4": -bonus_outstanding_km,
+    }
+    urn = pl.col("URN")
+    bonus_expr = pl.col("_ofsted_grade").replace_strict(
+        grade_to_bonus, default=0.0, return_dtype=pl.Float64
+    )
+
+    with_grade = _with_derived_grade(ofsted).filter(urn.is_not_null())
+    per_school = with_grade.select(
+        urn.cast(pl.Int64).alias("urn"),
+        bonus_expr.alias("bonus_km"),
+    )
+    return per_school.unique(subset="urn", keep="first")
+
+
+def phase_intakes(gias: pl.DataFrame) -> pl.DataFrame:
+    """Split each eligible school's fill target between the two phases.
+
+    Eligible schools have a ``type_group`` in STATE_SCHOOL_TYPE_GROUPS, an
+    ``admissions_policy`` that is null or not "Selective", non-null lat/lng,
+    a parseable ``age_range`` and a positive fill target. The fill target is
+    max(pupils, capacity) with nulls read as 0. It is spread over the cohort
+    ages the school teaches (``age_range`` "3–11" means ages 3..10), with
+    ages below 4 weighted by NURSERY_COHORT_WEIGHT and ages 16+ by
+    SIXTH_FORM_COHORT_WEIGHT; each phase receives the share of that weight
+    lying inside its own age band.
+
+    Returns ``(urn, lat, lng, primary_intake, secondary_intake)``.
+    """
+    range_numbers = pl.col("age_range").str.extract_all(r"\d+")
+    first_age = range_numbers.list.get(0, null_on_oob=True).cast(
+        pl.Int64, strict=False
+    )
+    # The second number is the leaving age, which is exclusive as a cohort:
+    # a "3-11" school teaches children aged 3 through 10.
+    last_age = (
+        range_numbers.list.get(1, null_on_oob=True).cast(pl.Int64, strict=False) - 1
+    )
+
+    is_state_funded = pl.col("type_group").is_in(STATE_SCHOOL_TYPE_GROUPS)
+    policy = pl.col("admissions_policy")
+    non_selective = policy.is_null() | (policy != "Selective")
+    has_coords = pl.col("lat").is_not_null() & pl.col("lng").is_not_null()
+    fill_target = pl.max_horizontal(
+        pl.col("pupils").fill_null(0), pl.col("capacity").fill_null(0)
+    ).cast(pl.Float64)
+
+    eligible = gias.filter(is_state_funded & non_selective & has_coords)
+    eligible = eligible.with_columns(first_age.alias("_low"), last_age.alias("_high"))
+    eligible = eligible.filter(
+        pl.col("_low").is_not_null() & (pl.col("_high") >= pl.col("_low"))
+    )
+    eligible = eligible.with_columns(fill_target.alias("_fill_target"))
+    eligible = eligible.filter(pl.col("_fill_target") > 0)
+
+    def cohort_weight(lo: int, hi: int, weight: float = 1.0) -> pl.Expr:
+        """Weighted count of taught ages falling inside [lo, hi]."""
+        span = (
+            pl.min_horizontal(pl.col("_high"), hi)
+            - pl.max_horizontal(pl.col("_low"), lo)
+            + 1
+        )
+        return (weight * span.clip(lower_bound=0)).cast(pl.Float64)
+
+    # Denominator: nursery, compulsory-age and sixth-form weight combined.
+    total_weight = (
+        cohort_weight(0, 3, NURSERY_COHORT_WEIGHT)
+        + cohort_weight(4, 15)
+        + cohort_weight(16, 30, SIXTH_FORM_COHORT_WEIGHT)
+    )
+    primary_intake = pl.col("_fill_target") * cohort_weight(*PRIMARY_AGES) / total_weight
+    secondary_intake = (
+        pl.col("_fill_target") * cohort_weight(*SECONDARY_AGES) / total_weight
+    )
+    return eligible.select(
+        pl.col("urn").cast(pl.Int64),
+        "lat",
+        "lng",
+        primary_intake.alias("primary_intake"),
+        secondary_intake.alias("secondary_intake"),
+    )
+
+
+def children_per_postcode(
+    postcodes: pl.DataFrame, lsoa_children: pl.DataFrame
+) -> pl.DataFrame:
+    """Spread each LSOA's phase-age child totals evenly over its postcodes.
+
+    The five-year census bands are prorated at one fifth per single year of
+    age: primary (4-10) takes 0.2 of ages 0-4, all of ages 5-9 and 0.2 of
+    ages 10-14; secondary (11-15) takes 0.8 of ages 10-14 and 0.2 of ages
+    15-19. Postcodes are inner-joined to their LSOA and each one receives the
+    LSOA total divided by the number of joined postcodes in that LSOA.
+
+    Returns ``(postcode, lat, lng, primary_children, secondary_children)``.
+    """
+    primary_total = (
+        0.2 * pl.col("aged_0_4") + pl.col("aged_5_9") + 0.2 * pl.col("aged_10_14")
+    )
+    secondary_total = 0.8 * pl.col("aged_10_14") + 0.2 * pl.col("aged_15_19")
+    lsoa_totals = lsoa_children.select(
+        "lsoa21",
+        primary_total.alias("_lsoa_primary"),
+        secondary_total.alias("_lsoa_secondary"),
+    )
+
+    joined = postcodes.join(
+        lsoa_totals, left_on="lsoa21cd", right_on="lsoa21", how="inner"
+    )
+    joined = joined.with_columns(pl.len().over("lsoa21cd").alias("_lsoa_postcodes"))
+
+    postcodes_in_lsoa = pl.col("_lsoa_postcodes")
+    return joined.select(
+        "postcode",
+        "lat",
+        "lng",
+        (pl.col("_lsoa_primary") / postcodes_in_lsoa).alias("primary_children"),
+        (pl.col("_lsoa_secondary") / postcodes_in_lsoa).alias("secondary_children"),
+    )
+
+
+def equilibrium_cutoffs(
+    school_xy: np.ndarray,
+    fill_target: np.ndarray,
+    bonus_km: np.ndarray,
+    pc_xy: np.ndarray,
+    pc_children: np.ndarray,
+    k: int = NEAREST_SCHOOL_CANDIDATES,
+    max_iter: int = EQUILIBRIUM_MAX_ITER,
+    tau_km: float = CHOICE_TEMPERATURE_KM,
+) -> np.ndarray:
+    """Market-clearing admission cutoff distance (km) per school.
+
+    Deferred acceptance with distance priority, solved as cutoff dynamics
+    (Azevedo & Leshno): cutoffs start unbounded; each round every child unit
+    applies to its preferred feasible school(s) — a logit split over
+    effective distance (distance - school bonus) among schools whose cutoff
+    covers it, collapsing to the single best school when ``tau_km`` is 0 —
+    and each oversubscribed school tightens its cutoff to its marginal
+    admitted child's distance. Cutoffs only ever tighten, so the iteration
+    converges.
+
+    Args:
+        school_xy: One coordinate row per school, in the same projected
+            units as ``pc_xy``.
+        fill_target: Per-school child mass above which a school counts as
+            oversubscribed.
+        bonus_km: Per-school preference bonus, subtracted from distance to
+            give the effective distance used for choice (never for priority).
+        pc_xy: One coordinate row per demand point.
+        pc_children: Child mass at each demand point; points with no
+            children are ignored.
+        k: Number of nearest schools each demand point may consider, capped
+            at the number of schools.
+        max_iter: Upper bound on the number of tightening rounds.
+        tau_km: Logit temperature; ``<= 0`` sends each point's whole mass to
+            its single best feasible school.
+
+    Returns:
+        One cutoff per school; np.inf for schools that never fill (no binding
+        cutoff). All entries are np.inf when there are no schools or no
+        demand points with children.
+    """
+    n_schools = len(school_xy)
+    cutoff = np.full(n_schools, np.inf)
+    demand = np.flatnonzero(pc_children > 0)
+    # The neighbour query and the per-school segment loop below both assume at
+    # least one school and one applicant; without them nothing can fill.
+    if n_schools == 0 or len(demand) == 0:
+        return cutoff
+
+    k = min(k, n_schools)
+    weights = pc_children[demand]
+    tree = cKDTree(school_xy)
+    dist, cand = tree.query(pc_xy[demand], k=k, workers=-1)
+    if k == 1:
+        # A single-neighbour query squeezes the last axis; restore it so the
+        # per-candidate indexing below is uniform.
+        dist = dist[:, None]
+        cand = cand[:, None]
+    eff = dist - bonus_km[cand]
+
+    rows = np.arange(len(demand))
+    for _ in range(max_iter):
+        # Schools whose cutoff no longer reaches a demand point are infeasible.
+        eff_feasible = np.where(dist <= cutoff[cand], eff, np.inf)
+        if tau_km <= 0:
+            choice = np.argmin(eff_feasible, axis=1)
+            valid = np.isfinite(eff_feasible[rows, choice])
+            chosen_school = cand[rows[valid], choice[valid]]
+            chosen_dist = dist[rows[valid], choice[valid]]
+            chosen_mass = weights[valid]
+        else:
+            z = -eff_feasible / tau_km
+            z_max = z.max(axis=1, keepdims=True)
+            # Shift by the row maximum for numerical stability; rows with no
+            # feasible school have z_max == -inf and are left unshifted.
+            share = np.exp(z - np.where(np.isfinite(z_max), z_max, 0.0))
+            share[~np.isfinite(eff_feasible)] = 0.0
+            total = share.sum(axis=1, keepdims=True)
+            mass = weights[:, None] * share / np.where(total > 0, total, 1.0)
+            # Sub-thousandth-of-a-child applications only slow the sort down.
+            keep = mass > 1e-3
+            chosen_school = cand[keep]
+            chosen_dist = dist[keep]
+            chosen_mass = mass[keep]
+
+        # No applications survived this round (e.g. every logit share fell
+        # below the mass threshold): no school can tighten, so stop here
+        # instead of indexing into an empty segment.
+        if len(chosen_school) == 0:
+            break
+
+        # Group applications by school, nearest first within each school.
+        order = np.lexsort((chosen_dist, chosen_school))
+        s_sorted = chosen_school[order]
+        d_sorted = chosen_dist[order]
+        m_cum = np.cumsum(chosen_mass[order])
+        boundaries = np.flatnonzero(np.diff(s_sorted)) + 1
+        starts = np.concatenate(([0], boundaries))
+        ends = np.concatenate((boundaries, [len(s_sorted)]))
+
+        changed = False
+        for start, end in zip(starts, ends):
+            school = s_sorted[start]
+            # Cumulative applied mass within this school's segment.
+            seg_cum = m_cum[start:end] - (m_cum[start - 1] if start else 0.0)
+            if seg_cum[-1] <= fill_target[school]:
+                continue
+            # Distance of the application at which the fill target is reached.
+            marginal = d_sorted[start + np.searchsorted(seg_cum, fill_target[school])]
+            if marginal < cutoff[school]:
+                cutoff[school] = marginal
+                changed = True
+        if not changed:
+            break
+
+    return cutoff
+
+
+def capacity_fill_radii(
+    school_xy: np.ndarray,
+    fill_target: np.ndarray,
+    pc_xy: np.ndarray,
+    pc_children: np.ndarray,
+    max_radius_km: float = MAX_RADIUS_KM,
+) -> np.ndarray:
+    """Feasibility radius for schools without a binding cutoff.
+
+    An undersubscribed school admits anyone who applies, so its catchment is
+    bounded by plausibility rather than competition: the distance within
+    which the local child population would cover its fill target. Capped at
+    ``max_radius_km``.
+
+    Args:
+        school_xy: One coordinate row per school, in the same projected
+            units as ``pc_xy``.
+        fill_target: Per-school child mass the surrounding population must
+            reach.
+        pc_xy: One coordinate row per demand point.
+        pc_children: Child mass at each demand point; points with no
+            children are ignored.
+        max_radius_km: Search bound and the radius used when the target is
+            not reached.
+
+    Returns:
+        One radius per school. Only the nearest 4096 demand points within
+        ``max_radius_km`` are accumulated; a school whose target they do not
+        reach keeps ``max_radius_km``.
+    """
+    demand = np.flatnonzero(pc_children > 0)
+    radii = np.full(len(school_xy), max_radius_km)
+    # With no children anywhere no target can be covered, and the neighbour
+    # query below cannot be asked for zero neighbours.
+    if len(demand) == 0:
+        return radii
+
+    tree = cKDTree(pc_xy[demand])
+    demand_children = pc_children[demand]
+    k = min(4096, len(demand))
+    for i in range(len(school_xy)):
+        dists, idx = tree.query(
+            school_xy[i], k=k, distance_upper_bound=max_radius_km
+        )
+        # A single-neighbour query returns scalars; make them 1-D so the
+        # masking and cumulative sum below work for any k.
+        dists = np.atleast_1d(dists)
+        idx = np.atleast_1d(idx)
+        # Neighbours beyond the distance bound come back with infinite distance.
+        found = np.isfinite(dists)
+        cum = np.cumsum(demand_children[idx[found]])
+        if len(cum) and cum[-1] >= fill_target[i]:
+            radii[i] = dists[found][np.searchsorted(cum, fill_target[i])]
+    return radii
+
+
+def count_covering_catchments(
+    pc_xy: np.ndarray,
+    pc_valid: np.ndarray,
+    school_xy: np.ndarray,
+    school_radii: np.ndarray,
+    n_postcodes: int,
+) -> np.ndarray:
+    """Return, for every postcode, the number of school radii that reach it.
+
+    Postcodes flagged invalid in ``pc_valid`` are left at zero, as is every
+    postcode when no schools are supplied.
+    """
+    totals = np.zeros(n_postcodes, dtype=np.int32)
+    if len(school_xy) > 0:
+        usable = np.flatnonzero(pc_valid)
+        # Index only the usable postcodes; hits are positions within `usable`.
+        postcode_index = cKDTree(pc_xy[usable])
+        hits_per_school = postcode_index.query_ball_point(
+            school_xy, school_radii, workers=-1
+        )
+        per_usable = np.zeros(len(usable), dtype=np.int32)
+        for hits in hits_per_school:
+            per_usable[hits] += 1
+        # Scatter the tallies back to their positions in the full postcode list.
+        totals[usable] = per_usable
+    return totals
+
+
+def main():
+    """CLI entry point: model catchment radii and count covering catchments.
+
+    Reads the Ofsted, GIAS, ArcGIS postcode and LSOA-children parquets, solves
+    the admissions model once per phase, then writes the per-postcode
+    catchment counts (``--output``) and/or the per-school radii
+    (``--schools-output``).
+
+    Raises:
+        SystemExit: if neither ``--output`` nor ``--schools-output`` is given.
+        ValueError: if no good+ primary/secondary Ofsted schools are found.
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "Model school admission cutoff radii and count good+/outstanding "
+            "primary/secondary catchments covering each postcode"
+        )
+    )
+    parser.add_argument(
+        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
+    )
+    parser.add_argument(
+        "--gias", type=Path, required=True, help="GIAS open-school parquet"
+    )
+    parser.add_argument(
+        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
+    )
+    parser.add_argument(
+        "--lsoa-children",
+        type=Path,
+        required=True,
+        help="Census 2021 children by LSOA parquet",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=None,
+        help="Per-postcode counts parquet; omit for calibration runs that only "
+        "need --schools-output",
+    )
+    parser.add_argument(
+        "--schools-output",
+        type=Path,
+        default=None,
+        help="Optional per-school catchment radii parquet (for calibration/debugging)",
+    )
+    parser.add_argument(
+        "--bonus-outstanding-km",
+        type=float,
+        default=PREF_BONUS_OUTSTANDING_KM,
+        help="Preference bonus for Outstanding schools (calibration sweeps)",
+    )
+    parser.add_argument(
+        "--bonus-good-km",
+        type=float,
+        default=PREF_BONUS_GOOD_KM,
+        help="Preference bonus for Good schools (calibration sweeps)",
+    )
+    parser.add_argument(
+        "--demand-scale",
+        type=float,
+        default=DEMAND_SCALE,
+        help="Share of resident children competing for state places",
+    )
+    parser.add_argument(
+        "--choice-temperature-km",
+        type=float,
+        default=CHOICE_TEMPERATURE_KM,
+        help="Logit choice temperature over effective distance",
+    )
+    args = parser.parse_args()
+
+    # Validate before any parquet is read or the admissions model is solved,
+    # so a run that could not write anything fails immediately.
+    if args.output is None and args.schools_output is None:
+        raise SystemExit("Provide --output and/or --schools-output")
+
+    gias = pl.read_parquet(args.gias)
+    open_urns = set(
+        gias.select(pl.col("urn").cast(pl.Int64, strict=False))
+        .to_series()
+        .drop_nulls()
+        .to_list()
+    )
+    print(f"GIAS open register: {len(open_urns):,} open school URNs")
+
+    ofsted = pl.read_parquet(args.ofsted)
+    rated = classify_good_plus_schools(ofsted, open_urns=open_urns)
+    if rated.is_empty():
+        raise ValueError("No good+ primary/secondary Ofsted schools found")
+    print(f"Good+ school/phase rows: {len(rated):,}")
+
+    # Schools without a preference-bonus row compete with a zero bonus.
+    supply = phase_intakes(gias).join(
+        school_preference_bonuses(
+            ofsted,
+            bonus_outstanding_km=args.bonus_outstanding_km,
+            bonus_good_km=args.bonus_good_km,
+        ),
+        on="urn",
+        how="left",
+    ).with_columns(pl.col("bonus_km").fill_null(0.0))
+    print(f"State schools in admissions model: {len(supply):,}")
+
+    arcgis = pl.read_parquet(args.arcgis).select(
+        pl.col("pcds").alias("postcode"),
+        "lat",
+        pl.col("long").alias("lng"),
+        "lsoa21cd",
+        "doterm",
+    )
+    # Demand comes only from postcodes with no termination date whose LSOA
+    # code starts with "E"; coverage counts below use every postcode.
+    live = arcgis.filter(
+        pl.col("doterm").is_null() & pl.col("lsoa21cd").str.starts_with("E")
+    )
+    demand = children_per_postcode(live, pl.read_parquet(args.lsoa_children))
+    print(
+        f"Demand postcodes: {len(demand):,} "
+        f"({demand['primary_children'].sum():,.0f} primary-age, "
+        f"{demand['secondary_children'].sum():,.0f} secondary-age children)"
+    )
+
+    # Shared local-km projection so assignment and coverage use one metric.
+    pc_lats = arcgis["lat"].to_numpy()
+    pc_lngs = arcgis["lng"].to_numpy()
+    pc_valid = valid_uk_coords_mask(pc_lats, pc_lngs)
+    origin_lat = float(np.mean(pc_lats[pc_valid]))
+    pc_xy = _project_lat_lng_km(pc_lats, pc_lngs, origin_lat)
+
+    demand_lats = demand["lat"].to_numpy()
+    demand_lngs = demand["lng"].to_numpy()
+    demand_valid = valid_uk_coords_mask(demand_lats, demand_lngs)
+    demand_xy = _project_lat_lng_km(demand_lats, demand_lngs, origin_lat)
+
+    school_xy = _project_lat_lng_km(
+        supply["lat"].to_numpy(), supply["lng"].to_numpy(), origin_lat
+    )
+
+    radii = {}
+    for phase in ("primary", "secondary"):
+        in_phase = supply[f"{phase}_intake"].to_numpy() > 0
+        targets = supply[f"{phase}_intake"].to_numpy()[in_phase]
+        xy = school_xy[in_phase]
+        # Postcodes with unusable coordinates contribute no demand.
+        children = np.where(
+            demand_valid,
+            demand[f"{phase}_children"].to_numpy() * args.demand_scale,
+            0.0,
+        )
+        print(f"Solving {phase} admissions for {in_phase.sum():,} schools...")
+        cutoffs = equilibrium_cutoffs(
+            xy,
+            targets,
+            supply["bonus_km"].to_numpy()[in_phase],
+            demand_xy,
+            children,
+            tau_km=args.choice_temperature_km,
+        )
+        filled = np.isfinite(cutoffs)
+        print(
+            f"  {filled.sum():,} schools have binding cutoffs "
+            f"(median {np.median(cutoffs[filled]):.2f} km); "
+            f"{(~filled).sum():,} undersubscribed"
+        )
+        # Schools that never fill get a population-based radius instead.
+        fallback = capacity_fill_radii(
+            xy[~filled], targets[~filled], demand_xy, children
+        )
+        raw = cutoffs.copy()
+        raw[~filled] = fallback
+        radii[phase] = pl.DataFrame(
+            {
+                "urn": supply["urn"].to_numpy()[in_phase],
+                "phase": phase,
+                "cutoff_km": raw,
+                "filled": filled,
+                "radius_km": np.clip(
+                    raw * CUTOFF_CALIBRATION_FACTOR[phase],
+                    MIN_RADIUS_KM,
+                    MAX_RADIUS_KM,
+                ),
+            }
+        )
+        print(
+            f"  radius km: median {radii[phase]['radius_km'].median():.2f}, "
+            f"p90 {radii[phase]['radius_km'].quantile(0.9):.2f}"
+        )
+
+    # Attach each rated school's phase radius; rated schools outside the
+    # admissions model (special schools, selective schools, missing
+    # headcounts) cannot be given a defensible radius and are dropped.
+    rated = rated.with_columns(
+        pl.col("category").str.split("_").list.get(1).alias("phase")
+    )
+    rated_with_radius = rated.join(
+        pl.concat(list(radii.values())), on=["urn", "phase"], how="inner"
+    ).join(supply.select("urn", "lat", "lng"), on="urn", how="inner")
+    dropped = len(rated) - len(rated_with_radius)
+    print(
+        f"Rated school/phase rows with radii: {len(rated_with_radius):,} "
+        f"(dropped {dropped:,}, incl. selective schools)"
+    )
+
+    if args.output is not None:
+        category_counts = {}
+        # Sorted so the per-category log lines come out in a stable order.
+        for category in sorted({c for cats in SCHOOL_GROUPS.values() for c in cats}):
+            cat = rated_with_radius.filter(pl.col("category") == category)
+            cat_xy = _project_lat_lng_km(
+                cat["lat"].to_numpy(), cat["lng"].to_numpy(), origin_lat
+            )
+            category_counts[category] = count_covering_catchments(
+                pc_xy, pc_valid, cat_xy, cat["radius_km"].to_numpy(), len(arcgis)
+            )
+            print(f"  {category}: {len(cat):,} schools")
+
+        # Each group's column is the sum of its member categories' counts.
+        result = pl.DataFrame(
+            {
+                "postcode": arcgis["postcode"],
+                **{
+                    f"{group}_catchments": sum(category_counts[c] for c in categories)
+                    for group, categories in SCHOOL_GROUPS.items()
+                },
+            }
+        )
+        for group in SCHOOL_GROUPS:
+            col = result[f"{group}_catchments"]
+            print(f"  {group}_catchments: mean {col.mean():.2f}, max {col.max()}")
+
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        result.write_parquet(args.output)
+        size_mb = args.output.stat().st_size / (1024 * 1024)
+        print(f"Wrote {args.output} ({size_mb:.1f} MB)")
+
+    if args.schools_output is not None:
+        schools_out = rated_with_radius.select(
+            "urn", "category", "phase", "cutoff_km", "filled", "radius_km", "lat", "lng"
+        )
+        args.schools_output.parent.mkdir(parents=True, exist_ok=True)
+        schools_out.write_parquet(args.schools_output)
+        print(f"Wrote {args.schools_output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/pipeline/transform/school_proximity.py
+++ b/pipeline/transform/school_proximity.py
@ -1,199 +0,0 @@
-"""Compute Ofsted-rated school proximity counts per postcode."""
-
-import argparse
-from pathlib import Path
-
-import polars as pl
-
-from pipeline.utils.poi_counts import count_pois_per_postcode
-
-SCHOOL_GROUPS = {
-    "good_primary": ["good_primary", "outstanding_primary"],
-    "good_secondary": ["good_secondary", "outstanding_secondary"],
-    "outstanding_primary": ["outstanding_primary"],
-    "outstanding_secondary": ["outstanding_secondary"],
-}
-
-
-# Age thresholds for deciding which phase(s) a school serves. A school serves
-# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
-# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
-# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
-# both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted
-# phase" labels such schools as just "Secondary", which previously hid them from
-# every postcode's primary-school count.
-PRIMARY_MAX_AGE = 10
-SECONDARY_MIN_AGE = 12
-
-
-def classify_good_plus_schools(
-    ofsted: pl.DataFrame, open_urns: set[int] | None = None
-) -> pl.DataFrame:
-    """Label good+/outstanding primary & secondary schools for proximity counts.
-
-    Derives a grade ("1" = outstanding, "2" = good) and one or two proximity
-    ``category`` rows per school, returning a ``(postcode, category)`` frame.
-
-    Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
-    overall effectiveness" (OEIF = the previous Ofsted Education Inspection
-    Framework). A large and growing share of schools were last inspected under an
-    UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
-    that column is null/"Not judged" for them even when they are demonstrably
-    good — their status lives in "Ungraded inspection overall outcome" ("School
-    remains Good"/"School remains Outstanding"). Filtering on the graded column
-    alone dropped ~7,000 genuinely good/outstanding schools. We fall back to the
-    ungraded outcome, but ONLY when there is no usable graded result
-    (null/"Not judged"), so a genuine grade 3/4 is never overridden.
-
-    Outcomes flagged "(Concerns)" are NOT treated as good+: a "remains Good
-    (Concerns)" outcome signals inspectors found issues warranting an earlier
-    graded re-inspection, so marketing it as a good+ school is misleading.
-
-    Phase assignment uses the statutory age range when available (so all-through
-    and middle schools count toward BOTH primary and secondary), falling back to
-    the coarse "Ofsted phase" label when age columns are absent. When
-    ``open_urns`` is given, schools whose URN is not in the current GIAS open
-    register are dropped so closed/merged schools are not counted.
-    """
-    # Cast to Utf8 so the string predicates below are well-defined even if a
-    # column happens to be entirely null (read back as a Null dtype).
-    oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
-    ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
-    no_usable_grade = oeif.is_null() | (oeif == "Not judged")
-    has_concern = ungraded.str.contains(r"\(Concerns\)")
-    remains_outstanding = (
-        ungraded.str.starts_with("School remains Outstanding") & ~has_concern
-    )
-    remains_good = ungraded.str.starts_with("School remains Good") & ~has_concern
-    graded = (
-        ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
-        .with_columns(
-            pl.when(oeif.is_in(["1", "2"]))
-            .then(oeif)
-            .when(no_usable_grade & remains_outstanding)
-            .then(pl.lit("1"))
-            .when(no_usable_grade & remains_good)
-            .then(pl.lit("2"))
-            .otherwise(None)
-            .alias("_ofsted_grade")
-        )
-        .filter(pl.col("_ofsted_grade").is_not_null())
-    )
-
-    # Drop schools no longer open (closed/merged) when the GIAS open register is
-    # provided, so stale Ofsted "latest inspection" rows are not counted.
-    if open_urns is not None and "URN" in graded.columns:
-        graded = graded.filter(pl.col("URN").is_in(list(open_urns)))
-
-    # Decide which phase(s) each school serves.
-    if {"Statutory lowest age", "Statutory highest age"} <= set(graded.columns):
-        low = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
-        high = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
-        serves_primary = (
-            pl.when(low.is_not_null())
-            .then(low <= PRIMARY_MAX_AGE)
-            .otherwise(pl.col("Ofsted phase") == "Primary")
-        )
-        serves_secondary = (
-            pl.when(high.is_not_null())
-            .then(high >= SECONDARY_MIN_AGE)
-            .otherwise(pl.col("Ofsted phase") == "Secondary")
-        )
-    else:
-        serves_primary = pl.col("Ofsted phase") == "Primary"
-        serves_secondary = pl.col("Ofsted phase") == "Secondary"
-
-    graded = graded.with_columns(
-        serves_primary.alias("_serves_primary"),
-        serves_secondary.alias("_serves_secondary"),
-    )
-
-    # Good+ groups include both grade variants; outstanding groups count grade 1.
-    # A school can yield up to two rows (primary and secondary).
-    primary = graded.filter(pl.col("_serves_primary")).with_columns(
-        pl.when(pl.col("_ofsted_grade") == "1")
-        .then(pl.lit("outstanding_primary"))
-        .otherwise(pl.lit("good_primary"))
-        .alias("category")
-    )
-    secondary = graded.filter(pl.col("_serves_secondary")).with_columns(
-        pl.when(pl.col("_ofsted_grade") == "1")
-        .then(pl.lit("outstanding_secondary"))
-        .otherwise(pl.lit("good_secondary"))
-        .alias("category")
-    )
-    return pl.concat([primary, secondary]).select(
-        pl.col("Postcode").alias("postcode"),
-        "category",
-    )
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Count good+ and outstanding primary/secondary schools near each postcode"
-    )
-    parser.add_argument(
-        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
-    )
-    parser.add_argument(
-        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
-    )
-    parser.add_argument(
-        "--gias",
-        type=Path,
-        default=None,
-        help="GIAS open-school parquet; if given, only currently-open schools are counted",
-    )
-    parser.add_argument(
-        "--output", type=Path, required=True, help="Output parquet path"
-    )
-    args = parser.parse_args()
-
-    open_urns: set[int] | None = None
-    if args.gias is not None:
-        gias_urns = pl.read_parquet(args.gias).select("urn").to_series().drop_nulls()
-        open_urns = set(gias_urns.cast(pl.Int64, strict=False).to_list())
-        print(f"GIAS open register: {len(open_urns):,} open school URNs")
-
-    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns)
-    if ofsted.is_empty():
-        raise ValueError("No good+ primary/secondary Ofsted schools found")
-
-    print(f"Good+ schools: {len(ofsted):,}")
-    print(
-        "Outstanding schools: "
-        f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
-    )
-
-    # Join with arcgis to get lat/lng for each school's postcode
-    arcgis = pl.read_parquet(args.arcgis).select(
-        pl.col("pcds").alias("postcode"),
-        "lat",
-        pl.col("long").alias("lng"),
-    )
-
-    schools = ofsted.join(arcgis, on="postcode", how="inner")
-    if schools.is_empty():
-        raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates")
-    print(f"Schools with coordinates: {len(schools):,}")
-
-    # Load all postcodes for proximity counting
-    postcodes = arcgis.rename({"lng": "lon"})
-
-    counts_5km = count_pois_per_postcode(
-        postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
-    )
-    counts_2km = count_pois_per_postcode(
-        postcodes, schools, radius_km=2, groups=SCHOOL_GROUPS
-    )
-
-    result = counts_5km.join(counts_2km, on="postcode")
-
-    args.output.parent.mkdir(parents=True, exist_ok=True)
-    result.write_parquet(args.output)
-    size_mb = args.output.stat().st_size / (1024 * 1024)
-    print(f"Wrote {args.output} ({size_mb:.1f} MB)")
-
-
-if __name__ == "__main__":
-    main()
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@ -8,6 +8,7 @@ import polars as pl

 from pipeline.transform.join_epc_pp import (
    EPC_SOURCE_COLUMNS,
+    _join_address_parts,
    _run,
    _scan_epc_certificates,
 )
@ -111,6 +112,89 @@ def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
    assert df.schema["number_habitable_rooms"] == pl.Int16


+def test_join_address_parts_empty_string_components():
+    # Price-paid leaves absent SAON/PAON/STREET as "" rather than null, so
+    # concat_str(ignore_nulls=True) on its own let the separator leak into the
+    # display address (' 10 PALACE GREEN') and doubled it around an empty
+    # middle part. Blank or whitespace-only parts must add nothing.
+    # Each case is (saon, paon, street, expected address).
+    cases = [
+        # empty saon -> no leading space
+        ("", "10", "PALACE GREEN", "10 PALACE GREEN"),
+        # normal three-part address is unchanged
+        ("FLAT 1", "10", "HIGH STREET", "FLAT 1 10 HIGH STREET"),
+        # empty middle component -> no double space
+        ("FLAT 1", "", "HIGH STREET", "FLAT 1 HIGH STREET"),
+        # empty street -> no trailing space
+        ("FLAT 21", "82", "", "FLAT 21 82"),
+        # all-empty -> null, not whitespace junk
+        ("", "", "", None),
+        # all-null -> null
+        (None, None, None, None),
+        # whitespace-only component treated as empty
+        ("  ", "10", "PALACE GREEN", "10 PALACE GREEN"),
+        # per-component padding is stripped
+        (" FLAT 2", "11 ", "STATION ROAD", "FLAT 2 11 STATION ROAD"),
+    ]
+    saons, paons, streets, expected = (list(column) for column in zip(*cases))
+    frame = pl.DataFrame({"saon": saons, "paon": paons, "street": streets})
+    addresses = frame.select(
+        _join_address_parts("saon", "paon", "street").alias("address")
+    ).get_column("address")
+
+    assert addresses.to_list() == expected
+    # Invariant: every produced address is trimmed and single-spaced.
+    non_null = addresses.drop_nulls()
+    assert non_null.str.starts_with(" ").sum() == 0
+    assert non_null.str.ends_with(" ").sum() == 0
+    assert non_null.str.contains("  ", literal=True).sum() == 0
+
+
+def test_run_builds_clean_pp_address_from_empty_string_saon(tmp_path: Path):
+    # On ~88% of real price-paid rows saon is "" (not null); the published
+    # pp_address must not pick up a leading separator from it.
+    certificates = io.StringIO()
+    epc_writer = csv.DictWriter(certificates, fieldnames=EPC_SOURCE_COLUMNS)
+    epc_writer.writeheader()
+    epc_writer.writerow(_row())
+    epc_zip = tmp_path / "domestic-csv.zip"
+    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
+        bundle.writestr("certificates-2024.csv", certificates.getvalue())
+
+    # A single sale whose saon and locality are empty strings.
+    sale = {
+        "price": 250_000,
+        "date_of_transfer": date(2024, 2, 3),
+        "property_type": "T",
+        "postcode": "AA1 1AA",
+        "paon": "1",
+        "saon": "",
+        "street": "Example Street",
+        "locality": "",
+        "town_city": "Exampletown",
+        "duration": "F",
+        "old_new": "N",
+        "ppd_category": "A",
+    }
+    sales_path = tmp_path / "price-paid.parquet"
+    pl.DataFrame({name: [value] for name, value in sale.items()}).write_parquet(
+        sales_path
+    )
+
+    joined_path = tmp_path / "epc-pp.parquet"
+    _run(epc_zip, sales_path, joined_path, tmp_path)
+    joined = pl.read_parquet(joined_path)
+
+    assert joined.height == 1
+    # No leading space, and the clean address still matches its EPC record.
+    expected = [{"pp_address": "1 Example Street", "epc_address": "1 Example Street"}]
+    assert joined.select("pp_address", "epc_address").to_dicts() == expected
+
+
 def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@ -304,7 +304,7 @@ def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
        poi_counts=_by_postcode({}),
        noise=_by_postcode({}),
-        school_proximity=_by_postcode({}),
+        school_catchments=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=pl.LazyFrame(
@ -362,7 +362,7 @@ def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
        poi_counts=_by_postcode({}),
        noise=_by_postcode({}),
-        school_proximity=_by_postcode({}),
+        school_catchments=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=broadband,
@ -1057,7 +1057,7 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
        poi_counts=_by_postcode({}),
        noise=_by_postcode({}),
-        school_proximity=_by_postcode({}),
+        school_catchments=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=pl.LazyFrame(
--- a/pipeline/transform/test_poi_proximity.py
+++ b/pipeline/transform/test_poi_proximity.py
@ -1,9 +1,11 @@
 import polars as pl

 from pipeline.transform.poi_proximity import (
+    GREENSPACE_PARK_FUNCTIONS,
    POI_GROUPS_2KM,
    _build_poi_category_groups,
    _dynamic_poi_metric_renames,
+    _greenspace_count_frame,
    _groceries_categories,
 )
 from pipeline.utils.poi_counts import count_pois_per_postcode
@ -88,3 +90,84 @@ def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
        "parks_2km": "Number of amenities (Park) within 2km",
        "parks_5km": "Number of amenities (Park) within 5km",
    }
+
+
+def test_groceries_categories_exclude_speciality_food_retail() -> None:
+    """Speciality food retail (bakeries, butchers, delis, off-licences —
+    ~a third of the group) is excluded from the static groceries metric;
+    Supermarket, Convenience Store, Greengrocer and GEOLYTIX brands stay."""
+    in_scope = ["Tesco", "Supermarket", "Convenience Store", "Greengrocer"]
+    speciality = ["Bakery", "Butcher & Fishmonger", "Deli & Specialty", "Off-Licence"]
+    grocery_rows = in_scope + speciality
+    categories = grocery_rows + ["Café"]
+    pois = pl.DataFrame(
+        {
+            "category": categories,
+            "group": ["Groceries"] * len(grocery_rows) + ["Leisure"],
+            "lat": [51.5] * len(categories),
+            "lng": [-0.1] * len(categories),
+        }
+    )
+
+    kept = _groceries_categories(pois)
+
+    assert kept == ["Convenience Store", "Greengrocer", "Supermarket", "Tesco"]
+
+
+def test_park_group_excludes_playgrounds_and_play_space() -> None:
+    # Only Public Park Or Garden and Playing Field (open recreation grounds)
+    # count as a Park; "Play Space" (playgrounds) must stay out.
+    expected_functions = {"parks": ["Public Park Or Garden", "Playing Field"]}
+    assert GREENSPACE_PARK_FUNCTIONS == expected_functions
+
+
+def test_greenspace_count_frame_collapses_to_one_row_per_site() -> None:
+    # Rows: three gates of site-a (which has a site centroid), one gate of
+    # site-b (no centroid), and one centroid-fallback row with a null site_id.
+    columns = ("lat", "lng", "category", "site_id", "site_lat", "site_lng")
+    rows = [
+        (51.50, -0.10, "Public Park Or Garden", "site-a", 51.505, -0.105),
+        (51.51, -0.11, "Public Park Or Garden", "site-a", 51.505, -0.105),
+        (51.52, -0.12, "Public Park Or Garden", "site-a", 51.505, -0.105),
+        (53.0, -2.0, "Playing Field", "site-b", None, None),
+        (54.0, -3.0, "Public Park Or Garden", None, None, None),
+    ]
+    greenspace = pl.DataFrame(
+        {name: [row[pos] for row in rows] for pos, name in enumerate(columns)}
+    )
+
+    collapsed = _greenspace_count_frame(greenspace).sort("lat")
+
+    # One row per site (site-a collapses 3 → 1), null-site rows preserved.
+    assert collapsed.height == 3
+    # site-a is represented by its site centroid…
+    only_a = collapsed.filter(pl.col("site_id") == "site-a")
+    assert only_a["lat"].to_list() == [51.505]
+    assert only_a["lng"].to_list() == [-0.105]
+    # …while site-b, lacking a centroid, keeps its first access point.
+    only_b = collapsed.filter(pl.col("site_id") == "site-b")
+    assert only_b["lat"].to_list() == [53.0]
+
+
+def test_greenspace_count_frame_passes_legacy_parquet_through() -> None:
+    # A parquet written before site_id existed must still be countable: the
+    # frame comes back unchanged (old access-point grain until regenerated).
+    legacy_columns = {
+        "lat": [51.50, 51.51],
+        "lng": [-0.10, -0.11],
+        "category": ["Public Park Or Garden", "Play Space"],
+    }
+    legacy = pl.DataFrame(legacy_columns)
+
+    passed_through = _greenspace_count_frame(legacy)
+
+    assert passed_through.equals(legacy)
--- a/pipeline/transform/test_school_catchments.py
+++ b/pipeline/transform/test_school_catchments.py
@ -0,0 +1,354 @@
+import numpy as np
+import polars as pl
+
+from pipeline.transform.school_catchments import (
+    capacity_fill_radii,
+    children_per_postcode,
+    classify_good_plus_schools,
+    count_covering_catchments,
+    equilibrium_cutoffs,
+    phase_intakes,
+    school_preference_bonuses,
+)
+
+
+def _school(phase, oeif, ungraded, urn=100000):
+    """Build one input row for ``classify_good_plus_schools``.
+
+    Every row uses the postcode "AA1 1AA"; tests tell schools apart by ``urn``.
+    """
+    row = {"URN": urn, "Postcode": "AA1 1AA"}
+    row["Ofsted phase"] = phase
+    row["Latest OEIF overall effectiveness"] = oeif
+    row["Ungraded inspection overall outcome"] = ungraded
+    return row
+
+
+def _classify(rows):
+    """Classify ``rows`` and return the resulting ``(urn, category)`` pairs as a set."""
+    classified = classify_good_plus_schools(pl.DataFrame(rows)).to_dicts()
+    return {(record["urn"], record["category"]) for record in classified}
+
+
+def test_legacy_oeif_grades_1_and_2_are_kept():
+    """Graded OEIF results "1" and "2" map to outstanding/good in each phase."""
+    expected = {
+        (1, "outstanding_primary"),
+        (2, "good_primary"),
+        (3, "outstanding_secondary"),
+        (4, "good_secondary"),
+    }
+    schools = [
+        _school("Primary", "1", None, 1),
+        _school("Primary", "2", None, 2),
+        _school("Secondary", "1", None, 3),
+        _school("Secondary", "2", None, 4),
+    ]
+    assert _classify(schools) == expected
+
+
+def test_grades_3_and_4_are_excluded():
+    """Graded OEIF results "3" and "4" produce no good+ category."""
+    below_good = [_school("Primary", grade, None) for grade in ("3", "4")]
+    assert _classify(below_good) == set()
+
+
+def test_ungraded_remains_good_is_recovered_when_no_graded_result():
+    """With no graded result, a "remains Good/Outstanding" outcome is used."""
+    schools = [
+        # OEIF is null: the ungraded outcome supplies the grade.
+        _school("Primary", None, "School remains Good", 1),
+        # OEIF is "Not judged": same fallback.
+        _school("Secondary", "Not judged", "School remains Outstanding", 2),
+        # An "(Improving)" suffix still counts as good+.
+        _school("Primary", None, "School remains Good (Improving) - S5 Next", 3),
+    ]
+    recovered = _classify(schools)
+    assert recovered == {
+        (1, "good_primary"),
+        (2, "outstanding_secondary"),
+        (3, "good_primary"),
+    }
+
+
+def test_ungraded_concerns_are_not_good_plus():
+    """Ungraded outcomes carrying a "(Concerns)" suffix are never good+."""
+    # "(Concerns)" signals issues warranting earlier re-inspection, so neither
+    # the Good nor the Outstanding variant may be counted.
+    good_with_concerns = _school(
+        "Primary", None, "School remains Good (Concerns) - S5 Next", 1
+    )
+    outstanding_with_concerns = _school(
+        "Secondary",
+        None,
+        "School remains Outstanding (Concerns) - S5 Next",
+        2,
+    )
+    assert _classify([good_with_concerns, outstanding_with_concerns]) == set()
+
+
+def test_ungraded_non_good_outcomes_are_excluded():
+    """Ungraded outcomes other than "remains Good/Outstanding" yield nothing."""
+    outcomes = ("Some aspects not as strong", "Standards maintained", None)
+    schools = [_school("Primary", None, outcome) for outcome in outcomes]
+    assert _classify(schools) == set()
+
+
+def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
+    """A graded "3" wins over an ungraded "School remains Good" outcome."""
+    # The ungraded outcome must not promote a school with a real grade 3.
+    graded_three = _school("Primary", "3", "School remains Good")
+    assert _classify([graded_three]) == set()
+
+
+def test_non_primary_secondary_phases_excluded():
+    """Phases other than Primary/Secondary are dropped even when graded 1 or 2."""
+    nursery = _school("Nursery", "1", None)
+    not_applicable = _school("Not applicable", "2", None)
+    assert _classify([nursery, not_applicable]) == set()
+
+
+def _aged_school(phase, oeif, low, high, urn=100000):
+    """Build a school row that also carries the statutory age range.
+
+    The ungraded outcome is always ``None``; ``low``/``high`` fill the
+    "Statutory lowest age" / "Statutory highest age" columns.
+    """
+    row = {
+        "URN": urn,
+        "Postcode": "AA1 1AA",
+        "Ofsted phase": phase,
+        "Latest OEIF overall effectiveness": oeif,
+        "Ungraded inspection overall outcome": None,
+    }
+    row.update({"Statutory lowest age": low, "Statutory highest age": high})
+    return row
+
+
+def test_all_through_school_counts_toward_both_primary_and_secondary():
+    """An age 3-18 school is classified under both phases."""
+    # Ofsted labels the all-through school "Secondary", but it serves
+    # primary-age children too, so it must appear in BOTH metrics.
+    all_through = _aged_school("Secondary", "2", 3, 18, 1)
+    both_phases = {(1, "good_primary"), (1, "good_secondary")}
+    assert _classify([all_through]) == both_phases
+
+
+def test_age_ranges_assign_single_phase_for_standard_schools():
+    """Age ranges decide the phase: 4-11 primary, 11-16 secondary, 9-13 both."""
+    primary_only = _aged_school("Primary", "1", 4, 11, 1)
+    secondary_only = _aged_school("Secondary", "2", 11, 16, 2)
+    # A middle school straddles both phases.
+    middle = _aged_school("Secondary", "1", 9, 13, 3)
+
+    classified = _classify([primary_only, secondary_only, middle])
+
+    assert classified == {
+        (1, "outstanding_primary"),
+        (2, "good_secondary"),
+        (3, "outstanding_primary"),
+        (3, "outstanding_secondary"),
+    }
+
+
+def test_closed_schools_excluded_when_open_register_given():
+    """Only URNs listed in ``open_urns`` survive classification."""
+    frame = pl.DataFrame(
+        [
+            _aged_school("Primary", "1", 4, 11, 111),
+            _aged_school("Secondary", "2", 11, 16, 222),
+        ]
+    )
+    kept = classify_good_plus_schools(frame, open_urns={111}).to_dicts()
+    # URN 222 is missing from the open register, so it is dropped.
+    surviving = {(row["urn"], row["category"]) for row in kept}
+    assert surviving == {(111, "outstanding_primary")}
+
+
+def _gias_row(
+    urn,
+    type_group="Academies",
+    age_range="4–11",
+    pupils=210,
+    capacity=None,
+    admissions_policy=None,
+):
+    """Build one input row for ``phase_intakes``.
+
+    The name is derived from ``urn`` and the coordinates are fixed at
+    (51.5, -0.1); every other column comes from the keyword arguments.
+    """
+    return dict(
+        urn=urn,
+        name=f"School {urn}",
+        lat=51.5,
+        lng=-0.1,
+        type_group=type_group,
+        age_range=age_range,
+        pupils=pupils,
+        capacity=capacity,
+        admissions_policy=admissions_policy,
+    )
+
+
+def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
+    """Headcount is shared between phases in proportion to cohort weights."""
+    schools = [
+        # 4-11 covers cohorts 4..10: all 7 are primary, so the whole fill
+        # target lands in primary.
+        _gias_row(1, age_range="4–11", pupils=210),
+        # 11-16 covers cohorts 11..15: all 5 are secondary.
+        _gias_row(2, age_range="11–16", pupils=500),
+        # 3-11 covers cohorts 3..10: the nursery year weighs 0.5, so primary
+        # takes 7 of 7.5 cohort weights.
+        _gias_row(3, age_range="3–11", pupils=240),
+        # All-through 4-16 covers cohorts 4..15: 7/12 primary, 5/12 secondary.
+        _gias_row(4, age_range="4–16", pupils=1200),
+        # 11-18 covers cohorts 11..17: each sixth-form year weighs 0.6, so
+        # secondary takes 5 of 6.2 cohort weights.
+        _gias_row(5, age_range="11–18", pupils=1240),
+    ]
+
+    intakes = phase_intakes(pl.DataFrame(schools)).sort("urn")
+
+    assert intakes["primary_intake"].to_list() == [210.0, 0.0, 224.0, 700.0, 0.0]
+    assert intakes["secondary_intake"].to_list() == [0.0, 500.0, 0.0, 500.0, 1000.0]
+
+
+def test_phase_intakes_excludes_non_state_and_selective_schools():
+    """Only URNs 5, 8 and 9 of the nine candidate rows receive an intake."""
+    candidates = [
+        _gias_row(1, type_group="Independent schools"),
+        _gias_row(2, type_group="Special schools"),
+        _gias_row(3, type_group="Welsh schools"),
+        # Grammar school intakes are test-based and region-wide; a distance
+        # catchment would be fabricated.
+        _gias_row(4, admissions_policy="Selective"),
+        # No pupil count: the expected intake below equals the capacity.
+        _gias_row(5, pupils=None, capacity=300),
+        # Neither pupils nor capacity: no usable headcount.
+        _gias_row(6, pupils=None, capacity=None),
+        # No age range: no parsable cohorts.
+        _gias_row(7, age_range=None),
+        # An over-full school keeps its demonstrated size.
+        _gias_row(8, pupils=350, capacity=300),
+        _gias_row(9, admissions_policy="Non-selective"),
+    ]
+
+    intakes = phase_intakes(pl.DataFrame(candidates)).sort("urn")
+
+    assert intakes["urn"].to_list() == [5, 8, 9]
+    assert intakes["primary_intake"].to_list() == [300.0, 350.0, 210.0]
+
+
+def test_school_preference_bonuses_follow_derived_grade():
+    """Each school's bonus is keyed by URN and follows its derived grade."""
+    schools = [
+        _school("Primary", "1", None, 1),
+        _school("Primary", "2", None, 2),
+        _school("Primary", "3", None, 3),
+        _school("Primary", "4", None, 4),
+        # No graded result and a non-good ungraded outcome: unrated.
+        _school("Primary", None, "Some aspects not as strong", 5),
+        _school("Primary", "Not judged", "School remains Good", 6),
+    ]
+    bonus_frame = school_preference_bonuses(
+        pl.DataFrame(schools), bonus_outstanding_km=1.0, bonus_good_km=0.5
+    )
+    bonus_by_urn = dict(bonus_frame.iter_rows())
+    assert bonus_by_urn == {1: 1.0, 2: 0.5, 3: -0.5, 4: -1.0, 5: 0.0, 6: 0.5}
+
+
+def test_children_per_postcode_prorates_bands_and_splits_lsoa_evenly():
+    """Age bands are prorated per phase, then split evenly over an LSOA's postcodes."""
+    # E01000001 holds two postcodes, E01000002 just one.
+    postcode_frame = pl.DataFrame(
+        {
+            "postcode": ["AA1 1AA", "AA1 1AB", "BB2 2BB"],
+            "lat": [51.5, 51.5, 52.0],
+            "lng": [-0.1, -0.1, -0.2],
+            "lsoa21cd": ["E01000001", "E01000001", "E01000002"],
+        }
+    )
+    band_frame = pl.DataFrame(
+        {
+            "lsoa21": ["E01000001", "E01000002"],
+            "aged_0_4": [100, 30],
+            "aged_5_9": [100, 10],
+            "aged_10_14": [100, 20],
+            "aged_15_19": [100, 40],
+        }
+    )
+
+    per_postcode = children_per_postcode(postcode_frame, band_frame).sort("postcode")
+
+    # Primary (ages 4-10) = 0.2*aged_0_4 + aged_5_9 + 0.2*aged_10_14.
+    # E01000001: 140 shared by 2 postcodes; E01000002: 20 for its one postcode.
+    assert per_postcode["primary_children"].to_list() == [70.0, 70.0, 20.0]
+    # Secondary (ages 11-15) = 0.8*aged_10_14 + 0.2*aged_15_19.
+    # E01000001: 100 shared by 2 postcodes; E01000002: 24.
+    assert per_postcode["secondary_children"].to_list() == [50.0, 50.0, 24.0]
+
+
+def test_equilibrium_cutoff_tightens_to_marginal_admitted_distance():
+    """A full school's cutoff is the distance of its last admitted child."""
+    # A single school with 10 places and no bonus.
+    school_xy = np.array([[0.0, 0.0]])
+    places = np.array([10.0])
+    bonuses = np.array([0.0])
+    # Postcodes 1km, 2km and 3km away, 5 children each.
+    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
+    children = np.array([5.0, 5.0, 5.0])
+
+    cutoffs = equilibrium_cutoffs(
+        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
+    )
+
+    # The two nearest postcodes fill the school exactly, so the cutoff sits at
+    # 2km and the 3km postcode is shut out.
+    assert cutoffs.tolist() == [2.0]
+
+
+def test_equilibrium_rejected_demand_cascades_to_next_school():
+    """Children turned away by a full school fall through to the next one."""
+    # School A sits at the origin, school B 10km away; 5 places each, no bonus.
+    school_xy = np.array([[0.0, 0.0], [10.0, 0.0]])
+    places = np.array([5.0, 5.0])
+    bonuses = np.array([0.0, 0.0])
+    # P1 (1km from A) and P2 (1.5km from A) hold 5 children each; both prefer A.
+    postcode_xy = np.array([[1.0, 0.0], [1.5, 0.0]])
+    children = np.array([5.0, 5.0])
+
+    cutoffs = equilibrium_cutoffs(
+        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
+    )
+    cutoff_a, cutoff_b = cutoffs[0], cutoffs[1]
+
+    # A fills with P1 and tightens to 1km, pushing P2 out to B.
+    assert cutoff_a == 1.0
+    # B never exceeds its target, so it keeps no binding cutoff.
+    assert np.isinf(cutoff_b)
+
+
+def test_equilibrium_preference_bonus_steers_demand_to_better_school():
+    """Between equidistant schools, the one with the larger bonus gets the demand."""
+    # Both schools are 1km from the only postcode; A carries a 0.5km bonus
+    # for its better rating, B none.
+    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
+    places = np.array([5.0, 5.0])
+    bonuses = np.array([0.5, 0.0])
+    postcode_xy = np.array([[1.0, 0.0]])
+    children = np.array([10.0])
+
+    cutoffs = equilibrium_cutoffs(
+        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
+    )
+
+    # Every child chooses A, so only A ends up with a binding cutoff.
+    assert cutoffs[0] == 1.0
+    # B attracts nobody.
+    assert np.isinf(cutoffs[1])
+
+
+def test_equilibrium_logit_choice_smears_demand_across_schools():
+    """A positive temperature spreads one postcode's demand over both schools."""
+    # Both schools are 1km from the only postcode and carry no bonus, so they
+    # offer equal utility. With tau_km=1.0 both schools receive applications.
+    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
+    places = np.array([4.0, 4.0])
+    bonuses = np.array([0.0, 0.0])
+    postcode_xy = np.array([[1.0, 0.0]])
+    children = np.array([10.0])
+
+    cutoffs = equilibrium_cutoffs(
+        school_xy, places, bonuses, postcode_xy, children, tau_km=1.0
+    )
+
+    # Each school draws half of the 10 children, which exceeds both 4-place
+    # fill targets: both cutoffs bind at the postcode's 1km distance.
+    assert cutoffs.tolist() == [1.0, 1.0]
+
+
+def test_capacity_fill_radii_covers_fill_target_population():
+    """The radius grows until the fill target is covered, or stops at the cap."""
+    # Two schools at the origin: one needs 6 children, the other 1000.
+    school_xy = np.array([[0.0, 0.0], [0.0, 0.0]])
+    fill_targets = np.array([6.0, 1000.0])
+    # Postcodes 1km, 2km and 3km away, 5 children each.
+    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
+    children = np.array([5.0, 5.0, 5.0])
+
+    radii = capacity_fill_radii(
+        school_xy, fill_targets, postcode_xy, children, max_radius_km=25.0
+    )
+
+    # 5 + 5 children pass the target of 6 at 2km. The second school needs more
+    # children than exist within the cap, so it keeps the 25km cap.
+    assert radii.tolist() == [2.0, 25.0]
+
+
+def test_count_covering_catchments_respects_radius_and_validity():
+    """Counts include only in-radius schools and are zero for invalid postcodes."""
+    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
+    radii = np.array([4.0, 1.5])
+    pc_xy = np.array([[0.0, 0.0], [3.0, 0.0], [10.0, 0.0], [0.5, 0.0]])
+    pc_valid = np.array([True, True, True, False])
+
+    counts = count_covering_catchments(pc_xy, pc_valid, school_xy, radii, 4)
+
+    # pc0: inside school 0 only (school 1 is 2km away, beyond its 1.5km radius).
+    # pc1: inside both catchments.
+    # pc2: inside neither.
+    # pc3: flagged invalid, so 0 despite its proximity.
+    assert counts.tolist() == [1, 2, 0, 0]
+
+
+def test_count_covering_catchments_empty_schools():
+    """With no schools at all, every postcode gets a count of zero."""
+    pc_xy = np.zeros((2, 2))
+    pc_valid = np.array([True, True])
+    no_school_xy = np.empty((0, 2))
+    no_radii = np.empty(0)
+    counts = count_covering_catchments(pc_xy, pc_valid, no_school_xy, no_radii, 2)
+    assert counts.tolist() == [0, 0]
--- a/pipeline/transform/test_school_proximity.py
+++ b/pipeline/transform/test_school_proximity.py
@ -1,139 +0,0 @@
-import polars as pl
-
-from pipeline.transform.school_proximity import classify_good_plus_schools
-
-
-def _school(phase, oeif, ungraded, postcode="AA1 1AA"):
-    return {
-        "Postcode": postcode,
-        "Ofsted phase": phase,
-        "Latest OEIF overall effectiveness": oeif,
-        "Ungraded inspection overall outcome": ungraded,
-    }
-
-
-def _classify(rows):
-    result = classify_good_plus_schools(pl.DataFrame(rows))
-    return {(r["postcode"], r["category"]) for r in result.to_dicts()}
-
-
-def test_legacy_oeif_grades_1_and_2_are_kept():
-    rows = [
-        _school("Primary", "1", None, "AA1 1AA"),
-        _school("Primary", "2", None, "AA1 1AB"),
-        _school("Secondary", "1", None, "AA1 1AC"),
-        _school("Secondary", "2", None, "AA1 1AD"),
-    ]
-    assert _classify(rows) == {
-        ("AA1 1AA", "outstanding_primary"),
-        ("AA1 1AB", "good_primary"),
-        ("AA1 1AC", "outstanding_secondary"),
-        ("AA1 1AD", "good_secondary"),
-    }
-
-
-def test_grades_3_and_4_are_excluded():
-    rows = [_school("Primary", "3", None), _school("Primary", "4", None)]
-    assert _classify(rows) == set()
-
-
-def test_ungraded_remains_good_is_recovered_when_no_graded_result():
-    # Null and "Not judged" OEIF fall back to the ungraded outcome.
-    rows = [
-        _school("Primary", None, "School remains Good", "AA1 1AA"),
-        _school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
-        # "(Improving)" is still good+ ...
-        _school("Primary", None, "School remains Good (Improving) - S5 Next", "AA1 1AE"),
-    ]
-    assert _classify(rows) == {
-        ("AA1 1AA", "good_primary"),
-        ("AA1 1AB", "outstanding_secondary"),
-        ("AA1 1AE", "good_primary"),
-    }
-
-
-def test_ungraded_concerns_are_not_good_plus():
-    # "(Concerns)" outcomes signal issues warranting earlier re-inspection and
-    # must NOT be counted as good+ schools.
-    rows = [
-        _school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
-        _school(
-            "Secondary",
-            None,
-            "School remains Outstanding (Concerns) - S5 Next",
-            "AA1 1AD",
-        ),
-    ]
-    assert _classify(rows) == set()
-
-
-def test_ungraded_non_good_outcomes_are_excluded():
-    rows = [
-        _school("Primary", None, "Some aspects not as strong"),
-        _school("Primary", None, "Standards maintained"),
-        _school("Primary", None, None),
-    ]
-    assert _classify(rows) == set()
-
-
-def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
-    # A real grade 3 must not be promoted by an ungraded "remains Good".
-    rows = [_school("Primary", "3", "School remains Good")]
-    assert _classify(rows) == set()
-
-
-def test_non_primary_secondary_phases_excluded():
-    rows = [
-        _school("Nursery", "1", None),
-        _school("Not applicable", "2", None),
-    ]
-    assert _classify(rows) == set()
-
-
-def _aged_school(phase, oeif, low, high, postcode="AA1 1AA"):
-    return {
-        "Postcode": postcode,
-        "Ofsted phase": phase,
-        "Latest OEIF overall effectiveness": oeif,
-        "Ungraded inspection overall outcome": None,
-        "URN": 100000,
-        "Statutory lowest age": low,
-        "Statutory highest age": high,
-    }
-
-
-def test_all_through_school_counts_toward_both_primary_and_secondary():
-    # An all-through school (age 3-18) is labelled "Secondary" by Ofsted phase but
-    # serves primary-age children too, so it must count in BOTH metrics.
-    rows = [_aged_school("Secondary", "2", 3, 18, "AA1 1AA")]
-    assert _classify(rows) == {
-        ("AA1 1AA", "good_primary"),
-        ("AA1 1AA", "good_secondary"),
-    }
-
-
-def test_age_ranges_assign_single_phase_for_standard_schools():
-    rows = [
-        _aged_school("Primary", "1", 4, 11, "AA1 1AA"),  # primary only
-        _aged_school("Secondary", "2", 11, 16, "AA1 1AB"),  # secondary only
-        _aged_school("Secondary", "1", 9, 13, "AA1 1AC"),  # middle -> both
-    ]
-    assert _classify(rows) == {
-        ("AA1 1AA", "outstanding_primary"),
-        ("AA1 1AB", "good_secondary"),
-        ("AA1 1AC", "outstanding_primary"),
-        ("AA1 1AC", "outstanding_secondary"),
-    }
-
-
-def test_closed_schools_excluded_when_open_register_given():
-    rows = [
-        _aged_school("Primary", "1", 4, 11, "AA1 1AA"),
-        _aged_school("Secondary", "2", 11, 16, "AA1 1AB"),
-    ]
-    rows[0]["URN"] = 111
-    rows[1]["URN"] = 222
-    result = classify_good_plus_schools(pl.DataFrame(rows), open_urns={111})
-    pairs = {(r["postcode"], r["category"]) for r in result.to_dicts()}
-    # URN 222 is not in the open register, so it is dropped.
-    assert pairs == {("AA1 1AA", "outstanding_primary")}
--- a/pipeline/transform/test_transform_poi.py
+++ b/pipeline/transform/test_transform_poi.py
@ -544,6 +544,142 @@ def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
    assert n2_grocery.height == 1


+def test_transform_drops_miscategorised_tags(tmp_path):
+    """None of the audited junk tags survives ``transform``."""
+    # Audit 2026-06-10: these tags polluted Entertainment (cycle-hire docks,
+    # slipways, marinas), Gallery (public artwork), Pharmacy (herbalists,
+    # alternative medicine), Hospital & Clinic (untyped healthcare/yes),
+    # Tourist Attraction (fountains, courthouses) and Gym & Fitness (outdoor
+    # apparatus). They must be dropped entirely.
+    dropped = [
+        "amenity/bicycle_rental",
+        "amenity/boat_rental",
+        "leisure/marina",
+        "leisure/slipway",
+        "tourism/artwork",
+        "healthcare/yes",
+        "healthcare/alternative",
+        "shop/herbalist",
+        "shop/health",
+        "amenity/fountain",
+        "amenity/courthouse",
+        "leisure/fitness_station",
+    ]
+    count = len(dropped)
+    ids = [f"n{i}" for i in range(count)]
+    raw = pl.DataFrame(
+        {
+            "id": ids,
+            "name": [f"POI {i}" for i in range(count)],
+            "category": dropped,
+            "lat": [51.50] * count,
+            "lng": [-0.10] * count,
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    survivors = out.filter(pl.col("id").is_in(ids))
+    assert survivors.height == 0
+
+
+def test_transform_splits_hospital_and_clinic(tmp_path):
+    """Hospitals and clinics land in separate categories, never the merged one."""
+    raw = pl.DataFrame(
+        {
+            "id": ["n1", "n2", "n3"],
+            "name": ["St Thomas'", "Vale Surgery Annexe", "Drop-in Centre"],
+            "category": [
+                "amenity/hospital",
+                "amenity/clinic",
+                "healthcare/clinic",
+            ],
+            "lat": [51.50, 51.51, 51.52],
+            "lng": [-0.10, -0.11, -0.12],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    expected_by_id = {"n1": "Hospital", "n2": "Clinic", "n3": "Clinic"}
+    for poi_id, expected in expected_by_id.items():
+        categories = out.filter(pl.col("id") == poi_id)["category"].to_list()
+        assert categories == [expected]
+    assert "Hospital & Clinic" not in out["category"].to_list()
+
+
+def test_transform_maps_chalet_to_hotel(tmp_path):
+    """``tourism/chalet`` is categorised as "Hotel"."""
+    # Holiday-let chalets are accommodation, not Tourist Attractions.
+    chalet = {
+        "id": ["n1"],
+        "name": ["Seaview Chalet"],
+        "category": ["tourism/chalet"],
+        "lat": [51.50],
+        "lng": [-0.10],
+    }
+    inputs = _write_transform_inputs(tmp_path, pl.DataFrame(chalet))
+
+    out = transform(**inputs).collect()
+
+    categories = out.filter(pl.col("id") == "n1")["category"].to_list()
+    assert categories == ["Hotel"]
+
+
+def test_transform_name_gates_track_horse_riding_fishing(tmp_path):
+    """Unnamed track/fishing rows vanish; named ones become a Sports Centre."""
+    # leisure/track, leisure/horse_riding and leisure/fishing are 83-84%
+    # unnamed (anonymous tracks/gallops/fishing spots); only named public
+    # facilities survive as a Sports Centre.
+    unnamed_ids = ["n1", "n2"]  # name is None / empty string
+    named_ids = ["n3", "n4"]
+    raw = pl.DataFrame(
+        {
+            "id": unnamed_ids + named_ids,
+            "name": [None, "", "Herne Hill Velodrome", "Royal Mews Riding School"],
+            "category": [
+                "leisure/track",
+                "leisure/fishing",
+                "leisure/track",
+                "leisure/horse_riding",
+            ],
+            "lat": [51.50, 51.51, 51.52, 51.53],
+            "lng": [-0.10, -0.11, -0.12, -0.13],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, raw)
+
+    out = transform(**inputs).collect()
+
+    assert out.filter(pl.col("id").is_in(unnamed_ids)).height == 0
+    named = out.filter(pl.col("id").is_in(named_ids))
+    assert named["category"].to_list() == ["Sports Centre", "Sports Centre"]
+
+
+def test_transform_passes_through_tram_metro_naptan_category(tmp_path):
+    """A NaPTAN "Tram & Metro stop" row keeps its category, group and emoji."""
+    # NaPTAN now emits "Tram & Metro stop" (non-LU TMU/MET networks); it must
+    # flow through with the Public Transport group and its own emoji.
+    osm_rows = pl.DataFrame(
+        {
+            "id": ["n1"],
+            "name": ["A Cafe"],
+            "category": ["amenity/cafe"],
+            "lat": [51.50],
+            "lng": [-0.10],
+        }
+    )
+    inputs = _write_transform_inputs(tmp_path, osm_rows)
+    # Overwrite the NaPTAN input after the other inputs have been written.
+    naptan_rows = pl.DataFrame(
+        {
+            "id": ["naptan-1", "naptan-2"],
+            "name": ["Test Rail Station", "Weaste"],
+            "category": ["Rail station", "Tram & Metro stop"],
+            "lat": [51.51, 51.52],
+            "lng": [-0.13, -0.14],
+        }
+    )
+    naptan_rows.write_parquet(inputs["naptan_path"])
+
+    out = transform(**inputs).collect()
+
+    tram = out.filter(pl.col("category") == "Tram & Metro stop")
+    assert tram.height == 1
+    assert tram["group"].to_list() == ["Public Transport"]
+    assert tram["emoji"].to_list() == ["🚊"]
+
+
 def test_transform_output_unique_per_id_category(tmp_path):
    # Soundness: the full transform() output has at most one row per
    # (id, category) overall, across every source.
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@ -86,6 +86,28 @@ DROP_CATEGORIES = {
    "amenity/water_point",
    "amenity/watering_place",
    "amenity/weighbridge",
+    # Boating/cycle-hire infrastructure formerly miscategorised as
+    # "Entertainment" (46% of the bucket): cycle-hire dock stations, boat
+    # ramps and moorings are not entertainment venues.
+    "amenity/bicycle_rental",
+    "amenity/boat_rental",
+    "leisure/marina",
+    "leisure/slipway",
+    # Public art (statues, murals, village signs) formerly 93% of "Gallery".
+    "tourism/artwork",
+    # Outdoor exercise apparatus (pull-up bars, trim trails) formerly inflating
+    # "Gym & Fitness".
+    "leisure/fitness_station",
+    # Untyped healthcare rows and non-pharmacy health shops formerly bucketed
+    # under "Hospital & Clinic" / "Pharmacy".
+    "healthcare/yes",
+    "healthcare/alternative",
+    "shop/herbalist",
+    "shop/health",
+    # Street fountains and courthouses formerly bucketed as
+    # "Tourist Attraction".
+    "amenity/fountain",
+    "amenity/courthouse",
    # Niche amenities not useful for home buyers
    "amenity/animal_boarding",
    "amenity/animal_breeding",
@ -373,10 +395,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "leisure/tanning_salon",
            "shop/amusements",
            "tourism/theme_park",
-            "amenity/bicycle_rental",
-            "amenity/boat_rental",
-            "leisure/marina",
-            "leisure/slipway",
+            # bicycle_rental/boat_rental/marina/slipway used to live here and
+            # made up ~46% of the bucket (cycle-hire docks, boat ramps); they
+            # are infrastructure, not entertainment venues — see DROP_CATEGORIES.
            "leisure/hackerspace",
            "leisure/yes",
        ],
@ -699,7 +720,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "🏋️",
        [
            "leisure/fitness_centre",
-            "leisure/fitness_station",
+            # leisure/fitness_station (outdoor pull-up bars / trim-trail
+            # apparatus, ~2.5k) is not a gym — see DROP_CATEGORIES.
            "amenity/dojo",
            "amenity/dancing_school",
        ],
@ -825,28 +847,37 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "amenity/pharmacy",
            "healthcare/pharmacy",
            "shop/chemist",
-            "shop/herbalist",
-            "shop/health",
-            "healthcare/alternative",
+            # healthcare/alternative, shop/herbalist and shop/health (homeopaths,
+            # herbalists, generic "health" shops) are not dispensing pharmacies
+            # — see DROP_CATEGORIES.
+        ],
+    ),
+    # "Hospital & Clinic" used to be one bucket; an actual hospital and a small
+    # clinic are very different amenities for a homebuyer, so they are split.
+    (
+        "Health",
+        "Hospital",
+        "🏥",
+        [
+            "amenity/hospital",
+            "healthcare/hospital",
        ],
    ),
    (
        "Health",
-        "Hospital & Clinic",
-        "🏥",
+        "Clinic",
+        "🩺",
        [
-            "amenity/hospital",
            "amenity/clinic",
            "amenity/health_centre",
            "healthcare/blood_donation",
-            "healthcare/hospital",
            "healthcare/centre",
            "healthcare/clinic",
            "office/healthcare",
            "healthcare/laboratory",
            "healthcare/rehabilitation",
            "healthcare/vaccination_centre",
-            "healthcare/yes",
+            # healthcare/yes (untyped junk rows) is dropped — see DROP_CATEGORIES.
        ],
    ),
    (
@ -917,7 +948,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        "🖼️",
        [
            "tourism/gallery",
-            "tourism/artwork",
+            # tourism/artwork (statues, murals, village signs) was 93% of this
+            # bucket and is not a visitable gallery — see DROP_CATEGORIES.
        ],
    ),
    (
@ -961,9 +993,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
        [
            "tourism/attraction",
            "tourism/aquarium",
-            "amenity/fountain",
-            "amenity/courthouse",
-            "tourism/chalet",
+            # amenity/fountain (street furniture) and amenity/courthouse are
+            # dropped; tourism/chalet (holiday lets) moved to "Hotel".
        ],
    ),
    # Note: schools come from the GIAS register (see transform_gias_schools).
@ -982,6 +1013,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
            "leisure/resort",
            "tourism/holiday_park",
            "tourism/self_catering",
+            # Holiday-let chalets are accommodation, not tourist attractions
+            # (where they previously sat).
+            "tourism/chalet",
        ],
    ),
    (
@ -1162,6 +1196,11 @@ REQUIRE_NAME_CATEGORIES = {
    "leisure/practice_pitch",
    "leisure/swimming_pool",
    "leisure/paddling_pool",
+    # 83-84% unnamed: anonymous running tracks, private gallops/paddocks and
+    # fishing spots; only named public facilities count as a Sports Centre.
+    "leisure/track",
+    "leisure/horse_riding",
+    "leisure/fishing",
 }


@ -1181,6 +1220,7 @@ NAPTAN_EMOJIS: dict[str, str] = {
    "Bus station": "🚌",
    "Taxi rank": "🚕",
    "Tube station": "🚇",
+    "Tram & Metro stop": "🚊",
 }


@ -1438,9 +1478,9 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    (null/"Not judged", e.g. schools last seen under the post-2024 ungraded
    report-card framework) we fall back to "Ungraded inspection overall outcome"
    so genuinely good/outstanding schools aren't dropped — mirroring
-    school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
+    school_catchments.classify_good_plus_schools. Remaining nulls drop out."""
    grade_col = pl.col("Latest OEIF overall effectiveness")
-    # See school_proximity: the ungraded outcome carries "School remains Good"/
+    # See school_catchments: the ungraded outcome carries "School remains Good"/
    # "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
    # suffixes) when the graded column is null/"Not judged".
    ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
--- a/server-rs/src/data/poi.rs
+++ b/server-rs/src/data/poi.rs
@ -55,6 +55,7 @@ const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
        &[
            "Rail station",
            "Tube station",
+            "Tram & Metro stop",
            "Bus station",
            "Bus stop",
            "Airport",
@ -79,7 +80,7 @@ const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
    ),
    (
        "Health",
-        &["GP Surgery", "Pharmacy", "Dentist", "Hospital & Clinic"],
+        &["GP Surgery", "Pharmacy", "Dentist", "Hospital", "Clinic"],
    ),
    (
        "Leisure",
--- a/server-rs/src/features.rs
+++ b/server-rs/src/features.rs
@ -180,20 +180,6 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
                raw: false,
                absolute: true,
            }),
-            Feature::Enum(EnumFeatureConfig {
-                name: "Within conservation area",
-                order: Some(&["Yes", "No"]),
-                description: "Whether the postcode point falls inside a designated conservation area",
-                detail: "Planning Data conservation area boundaries, matched to the postcode representative point. The national dataset is a work in progress and may include duplicates or incomplete local coverage, so boundary-sensitive decisions should be checked with the local planning authority.",
-                source: "conservation-areas",
-            }),
-            Feature::Enum(EnumFeatureConfig {
-                name: "Listed building",
-                order: Some(&["Yes", "No"]),
-                description: "Whether this property appears to match a Historic England listed building entry",
-                detail: "Historic England National Heritage List for England listed-building points, matched conservatively to property addresses using the listed-entry name and nearby postcode candidates. Treat this as a screening signal, not a legal determination: verify any specific property on the NHLE and with the local planning authority.",
-                source: "listed-buildings",
-            }),
            Feature::Numeric(FeatureConfig {
                name: "Noise (dB)",
                bounds: Bounds::Fixed {
@ -209,6 +195,20 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
                raw: false,
                absolute: false,
            }),
+            Feature::Enum(EnumFeatureConfig {
+                name: "Within conservation area",
+                order: Some(&["Yes", "No"]),
+                description: "Whether the postcode point falls inside a designated conservation area",
+                detail: "Planning Data conservation area boundaries, matched to the postcode representative point. The national dataset is a work in progress and may include duplicates or incomplete local coverage, so boundary-sensitive decisions should be checked with the local planning authority.",
+                source: "conservation-areas",
+            }),
+            Feature::Enum(EnumFeatureConfig {
+                name: "Listed building",
+                order: Some(&["Yes", "No"]),
+                description: "Whether this property appears to match a Historic England listed building entry",
+                detail: "Historic England National Heritage List for England listed-building points, matched conservatively to property addresses using the listed-entry name and nearby postcode candidates. Treat this as a screening signal, not a legal determination: verify any specific property on the NHLE and with the local planning authority.",
+                source: "listed-buildings",
+            }),
        ],
    },
    FeatureGroup {
@ -307,89 +307,14 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
        name: "Schools",
        features: &[
            Feature::Numeric(FeatureConfig {
-                name: "Good+ primary schools within 2km",
-                bounds: Bounds::Fixed {
-                    min: 0.0,
-                    max: 10.0,
-                },
-                step: 1.0,
-                description: "Primary schools rated Good or Outstanding by Ofsted within 2km",
-                detail: "State-funded primary schools within 2km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
-                source: "ofsted",
-                prefix: "",
-                suffix: "",
-                raw: false,
-                absolute: false,
-            }),
-            Feature::Numeric(FeatureConfig {
-                name: "Good+ secondary schools within 2km",
-                bounds: Bounds::Fixed {
-                    min: 0.0,
-                    max: 5.0,
-                },
-                step: 1.0,
-                description: "Secondary schools rated Good or Outstanding by Ofsted within 2km",
-                detail: "State-funded secondary schools within 2km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
-                source: "ofsted",
-                prefix: "",
-                suffix: "",
-                raw: false,
-                absolute: false,
-            }),
-            Feature::Numeric(FeatureConfig {
-                name: "Outstanding primary schools within 2km",
-                bounds: Bounds::Fixed {
-                    min: 0.0,
-                    max: 10.0,
-                },
-                step: 1.0,
-                description: "Primary schools rated Outstanding by Ofsted within 2km",
-                detail: "State-funded primary schools within 2km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
-                source: "ofsted",
-                prefix: "",
-                suffix: "",
-                raw: false,
-                absolute: false,
-            }),
-            Feature::Numeric(FeatureConfig {
-                name: "Outstanding secondary schools within 2km",
-                bounds: Bounds::Fixed {
-                    min: 0.0,
-                    max: 5.0,
-                },
-                step: 1.0,
-                description: "Secondary schools rated Outstanding by Ofsted within 2km",
-                detail: "State-funded secondary schools within 2km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
-                source: "ofsted",
-                prefix: "",
-                suffix: "",
-                raw: false,
-                absolute: false,
-            }),
-            Feature::Numeric(FeatureConfig {
-                name: "Good+ primary schools within 5km",
-                bounds: Bounds::Fixed {
-                    min: 0.0,
-                    max: 30.0,
-                },
-                step: 1.0,
-                description: "Primary schools rated Good or Outstanding by Ofsted within 5km",
-                detail: "State-funded primary schools within 5km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
-                source: "ofsted",
-                prefix: "",
-                suffix: "",
-                raw: false,
-                absolute: false,
-            }),
-            Feature::Numeric(FeatureConfig {
-                name: "Good+ secondary schools within 5km",
+                name: "Good+ primary school catchments",
                bounds: Bounds::Fixed {
                    min: 0.0,
                    max: 15.0,
                },
                step: 1.0,
-                description: "Secondary schools rated Good or Outstanding by Ofsted within 5km",
-                detail: "State-funded secondary schools within 5km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
+                description: "Primary schools rated Good or Outstanding whose modelled catchment area covers this postcode",
+                detail: "How many state-funded primary schools with a current Ofsted rating of Good or Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
                source: "ofsted",
                prefix: "",
                suffix: "",
@ -397,14 +322,14 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
                absolute: false,
            }),
            Feature::Numeric(FeatureConfig {
-                name: "Outstanding primary schools within 5km",
+                name: "Good+ secondary school catchments",
                bounds: Bounds::Fixed {
                    min: 0.0,
-                    max: 30.0,
+                    max: 11.0,
                },
                step: 1.0,
-                description: "Primary schools rated Outstanding by Ofsted within 5km",
-                detail: "State-funded primary schools within 5km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
+                description: "Secondary schools rated Good or Outstanding whose modelled catchment area covers this postcode",
+                detail: "How many state-funded secondary schools with a current Ofsted rating of Good or Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
                source: "ofsted",
                prefix: "",
                suffix: "",
@ -412,14 +337,29 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
                absolute: false,
            }),
            Feature::Numeric(FeatureConfig {
-                name: "Outstanding secondary schools within 5km",
+                name: "Outstanding primary school catchments",
                bounds: Bounds::Fixed {
                    min: 0.0,
-                    max: 15.0,
+                    max: 8.0,
                },
                step: 1.0,
-                description: "Secondary schools rated Outstanding by Ofsted within 5km",
-                detail: "State-funded secondary schools within 5km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
+                description: "Primary schools rated Outstanding whose modelled catchment area covers this postcode",
+                detail: "How many state-funded primary schools with a current Ofsted rating of Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
+                source: "ofsted",
+                prefix: "",
+                suffix: "",
+                raw: false,
+                absolute: false,
+            }),
+            Feature::Numeric(FeatureConfig {
+                name: "Outstanding secondary school catchments",
+                bounds: Bounds::Fixed {
+                    min: 0.0,
+                    max: 4.0,
+                },
+                step: 1.0,
+                description: "Secondary schools rated Outstanding whose modelled catchment area covers this postcode",
+                detail: "How many state-funded secondary schools with a current Ofsted rating of Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
                source: "ofsted",
                prefix: "",
                suffix: "",
--- a/server-rs/src/routes/ai_filters.rs
+++ b/server-rs/src/routes/ai_filters.rs
@ -62,6 +62,42 @@ pub struct AiFiltersResponse {
    notes: String,
    /// Number of properties matching the proposed property and travel time filters.
    match_count: usize,
+    /// Bounding box of the matching properties so the client can move the
+    /// camera to where matches actually are. Absent when nothing matches.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    match_bounds: Option<MatchBounds>,
+}
+
+/// Axis-aligned bounding box of the matching properties, serialized as the
+/// optional `match_bounds` field of `AiFiltersResponse`.
+///
+/// `percentile_trimmed_bounds` fills `south`/`north` from the matched
+/// latitudes and `west`/`east` from the matched longitudes.
+#[derive(Serialize)]
+pub struct MatchBounds {
+    // Lower latitude bound.
+    south: f32,
+    // Lower longitude bound.
+    west: f32,
+    // Upper latitude bound.
+    north: f32,
+    // Upper longitude bound.
+    east: f32,
+}
+
+/// Bounding box over matched coordinates, trimmed to the 5th–95th percentile
+/// per axis (when there are enough points) so a handful of remote outliers
+/// doesn't zoom the camera out to all of England.
+///
+/// `lats` and `lons` are parallel vectors with one entry per matched row.
+/// Each axis is sorted and trimmed independently, so the corners of the box
+/// are not necessarily the coordinates of any single matched row.
+///
+/// Returns `None` when the two vectors differ in length, or when no point
+/// with two finite coordinates remains. Points with a NaN or infinite
+/// coordinate are dropped up front: `f32::total_cmp` sorts NaN to the
+/// extremes, where it would be picked as a bound, and a non-finite bound
+/// would presumably reach the client as `null` (serde_json's encoding of
+/// non-finite floats) instead of a usable number.
+fn percentile_trimmed_bounds(mut lats: Vec<f32>, mut lons: Vec<f32>) -> Option<MatchBounds> {
+    if lats.len() != lons.len() {
+        return None;
+    }
+    // Only rebuild the vectors when something actually has to be dropped, so
+    // the common all-finite case reuses the caller's allocations.
+    if lats.iter().chain(lons.iter()).any(|v| !v.is_finite()) {
+        let (kept_lats, kept_lons): (Vec<f32>, Vec<f32>) = lats
+            .iter()
+            .zip(lons.iter())
+            .filter(|(lat, lon)| lat.is_finite() && lon.is_finite())
+            .map(|(lat, lon)| (*lat, *lon))
+            .unzip();
+        lats = kept_lats;
+        lons = kept_lons;
+    }
+    if lats.is_empty() {
+        return None;
+    }
+    lats.sort_unstable_by(f32::total_cmp);
+    lons.sort_unstable_by(f32::total_cmp);
+    // Drop len/20 (~5%) of the points from each end of each axis. Below 20
+    // points the integer division yields 0, so nothing is trimmed.
+    let trim = lats.len() / 20;
+    let lo = trim;
+    let hi = lats.len() - 1 - trim;
+    Some(MatchBounds {
+        south: lats[lo],
+        north: lats[hi],
+        west: lons[lo],
+        east: lons[hi],
+    })
 }

 /// Strip markdown code fences (```json ... ``` or ``` ... ```) from LLM output.
@ -90,17 +126,12 @@ fn school_feature_name_from_key(name: &str) -> Option<&'static str> {
    let mut parts = rest.split(':');
    let phase = parts.next()?;
    let rating = parts.next()?;
-    let distance = parts.next()?;

-    match (phase, rating, distance) {
-        ("primary", "good", "2") => Some("Good+ primary schools within 2km"),
-        ("secondary", "good", "2") => Some("Good+ secondary schools within 2km"),
-        ("primary", "outstanding", "2") => Some("Outstanding primary schools within 2km"),
-        ("secondary", "outstanding", "2") => Some("Outstanding secondary schools within 2km"),
-        ("primary", "good", "5") => Some("Good+ primary schools within 5km"),
-        ("secondary", "good", "5") => Some("Good+ secondary schools within 5km"),
-        ("primary", "outstanding", "5") => Some("Outstanding primary schools within 5km"),
-        ("secondary", "outstanding", "5") => Some("Outstanding secondary schools within 5km"),
+    match (phase, rating) {
+        ("primary", "good") => Some("Good+ primary school catchments"),
+        ("secondary", "good") => Some("Good+ secondary school catchments"),
+        ("primary", "outstanding") => Some("Outstanding primary school catchments"),
+        ("secondary", "outstanding") => Some("Outstanding secondary school catchments"),
        _ => None,
    }
 }
@ -508,8 +539,8 @@ pub fn build_system_prompt(
         {\"name\": \"Serious crime (avg/yr)\", \"bound\": \"max\", \"value\": 5}, \
         {\"name\": \"Minor crime (avg/yr)\", \"bound\": \"max\", \"value\": 20}, \
         {\"name\": \"Noise (dB)\", \"bound\": \"max\", \"value\": 55}, \
-         {\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}, \
-         {\"name\": \"Good+ secondary schools within 2km\", \"bound\": \"min\", \"value\": 1}, \
+         {\"name\": \"Good+ primary school catchments\", \"bound\": \"min\", \"value\": 2}, \
+         {\"name\": \"Good+ secondary school catchments\", \"bound\": \"min\", \"value\": 1}, \
         {\"name\": \"Number of amenities (Park) within 2km\", \"bound\": \"min\", \"value\": 3}], \
         \"enum_filters\": [], \"travel_time_filters\": [], \"notes\": \"\"}"
            .to_string(),
@ -519,8 +550,8 @@ pub fn build_system_prompt(
        "\nUser: \"quiet area with outstanding schools\"\n\
         Output: {\"numeric_filters\": [\
         {\"name\": \"Noise (dB)\", \"bound\": \"max\", \"value\": 55}, \
-         {\"name\": \"Outstanding primary schools within 2km\", \"bound\": \"min\", \"value\": 1}, \
-         {\"name\": \"Outstanding secondary schools within 2km\", \"bound\": \"min\", \"value\": 1}], \
+         {\"name\": \"Outstanding primary school catchments\", \"bound\": \"min\", \"value\": 1}, \
+         {\"name\": \"Outstanding secondary school catchments\", \"bound\": \"min\", \"value\": 1}], \
         \"enum_filters\": [], \"travel_time_filters\": [], \"notes\": \"\"}"
            .to_string(),
    );
@ -557,8 +588,8 @@ pub fn build_system_prompt(
         Output: {\"numeric_filters\": [\
         {\"name\": \"Total floor area (sqm)\", \"bound\": \"min\", \"value\": 100}, \
         {\"name\": \"Number of bedrooms & living rooms\", \"bound\": \"min\", \"value\": 5}, \
-         {\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}, \
-         {\"name\": \"Good+ secondary schools within 2km\", \"bound\": \"min\", \"value\": 1}], \
+         {\"name\": \"Good+ primary school catchments\", \"bound\": \"min\", \"value\": 2}, \
+         {\"name\": \"Good+ secondary school catchments\", \"bound\": \"min\", \"value\": 1}], \
         \"enum_filters\": [{\"name\": \"Property type\", \
         \"values\": [\"Detached\", \"Semi-Detached\"]}], \
         \"travel_time_filters\": [{\"mode\": \"car\", \"slug\": \"manchester\", \
@ -592,7 +623,7 @@ pub fn build_system_prompt(
        "\nUser: \"3 bed house under 500k with good schools\"\n\
         Output: {\
         \"numeric_filters\": [{\"name\": \"Estimated current price\", \"bound\": \"max\", \"value\": 500000}, \
-         {\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}], \
+         {\"name\": \"Good+ primary school catchments\", \"bound\": \"min\", \"value\": 2}], \
         \"enum_filters\": [{\"name\": \"Property type\", \
         \"values\": [\"Detached\", \"Semi-Detached\", \"Terraced\"]}], \
         \"travel_time_filters\": [], \
@ -759,7 +790,7 @@ fn count_matching_rows(
    state: &AppState,
    filters: &Value,
    travel_time_filters: &[TravelTimeFilter],
-) -> usize {
+) -> (usize, Option<MatchBounds>) {
    let filter_str = filters_to_filter_string(filters);

    let quant = state.data.quant_ref();
@ -778,7 +809,7 @@ fn count_matching_rows(
            Ok(f) => f,
            Err(err) => {
                warn!("Failed to parse filters for match count: {err}");
-                return 0;
+                return (0, None);
            }
        }
    };
@ -800,6 +831,8 @@ fn count_matching_rows(
    let has_poi_filters = !parsed_poi_filters.is_empty();

    let mut count = 0usize;
+    let mut matched_lats: Vec<f32> = Vec::new();
+    let mut matched_lons: Vec<f32> = Vec::new();
    for (row, pc_key) in pc_keys.iter().enumerate().take(num_rows) {
        if !row_passes_filters(
            row,
@ -836,9 +869,11 @@ fn count_matching_rows(
        }

        count += 1;
+        matched_lats.push(state.data.lat[row]);
+        matched_lons.push(state.data.lon[row]);
    }

-    count
+    (count, percentile_trimmed_bounds(matched_lats, matched_lons))
 }

 /// Budget limits for the Gemini conversation loop. Separate counters prevent
@ -1132,7 +1167,7 @@ pub async fn post_ai_filters(
            .to_string();

        // Count matching properties and refine if too restrictive
-        let match_count = count_matching_rows(&state, &filters, &travel_time_filters);
+        let (match_count, match_bounds) = count_matching_rows(&state, &filters, &travel_time_filters);
        info!(
            match_count = match_count,
            round = round,
@ -1173,6 +1208,7 @@ pub async fn post_ai_filters(
                    travel_time_filters,
                    notes,
                    match_count: 0,
+                    match_bounds: None,
                }));
            }

@ -1236,6 +1272,7 @@ pub async fn post_ai_filters(
            travel_time_filters,
            notes,
            match_count,
+            match_bounds,
        }));
    }

@ -1488,9 +1525,14 @@ mod tests {

    #[test]
    fn synthetic_filter_keys_are_normalized_to_backend_names() {
+        assert_eq!(
+            canonical_filter_name("Schools:primary:good:0"),
+            "Good+ primary school catchments"
+        );
+        // Legacy keys still carry a distance segment; it is ignored.
        assert_eq!(
            canonical_filter_name("Schools:primary:good:2:0"),
-            "Good+ primary schools within 2km"
+            "Good+ primary school catchments"
        );
        assert_eq!(
            canonical_filter_name("Specific crimes:Burglary%20%28avg%2Fyr%29:1"),
--- a/server-rs/src/routes/filter_counts.rs
+++ b/server-rs/src/routes/filter_counts.rs
@ -68,8 +68,11 @@ pub async fn get_filter_counts(
    let num_total_filters = num_regular + travel_filter_indices.len();

    if num_total_filters == 0 {
+        // With no active filters the total is simply every property in bounds.
+        // count_in_bounds is O(grid cells), far cheaper than walking every row.
+        let total = state.grid.count_in_bounds(south, west, north, east) as u32;
        return Ok(Json(FilterCountsResponse {
-            total: 0,
+            total,
            impacts: FxHashMap::default(),
        }));
    }