Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -48,7 +48,8 @@ NAPTAN := $(DATA_DIR)/naptan.parquet
BROADBAND := $(DATA_DIR)/broadband.parquet
CONSERVATION_AREAS := $(DATA_DIR)/conservation_areas.geojson
LISTED_BUILDINGS := $(DATA_DIR)/listed_buildings.gpkg
SCHOOL_PROX := $(DATA_DIR)/school_proximity.parquet
SCHOOL_CATCH := $(DATA_DIR)/school_catchments.parquet
LSOA_CHILDREN := $(DATA_DIR)/lsoa_children.parquet
RENTAL := $(DATA_DIR)/rental_prices.parquet
INSPIRE_DIR := $(DATA_DIR)/inspire
OA_BOUNDARIES := $(DATA_DIR)/oa_boundaries.gpkg
@ -100,19 +101,19 @@ PC_BOUNDARIES_DEPS := pipeline/transform/postcode_boundaries/__main__.py \
pipeline/transform/postcode_boundaries/voronoi.py
CRIME_DOWNLOAD_DEPS := pipeline/download/crime.py
INSPIRE_DOWNLOAD_DEPS := pipeline/download/inspire.py
TRANSIT_DOWNLOAD_DEPS := pipeline/download/transit_network.py pipeline/download/transxchange2gtfs_shim.js
TRANSIT_DOWNLOAD_DEPS := pipeline/download/transit_network.py
MAP_ASSETS_DEPS := pipeline/download/map_assets.py pipeline/transform/transform_poi.py
# ── Phony aliases ─────────────────────────────────────────────────────────────
.PHONY: prepare merge tiles satellite-tiles satellite-highres-tiles overlay-tiles noise-overlay-tiles crime-hotspot-tiles tree-overlay-tiles property-border-tiles \
download-arcgis download-price-paid download-deprivation download-ethnicity \
download-naptan download-pois download-grocery-retail-points download-ofsted download-gias download-broadband download-conservation-areas download-listed-buildings download-rental-prices \
download-naptan download-pois download-grocery-retail-points download-ofsted download-gias download-lsoa-children download-broadband download-conservation-areas download-listed-buildings download-rental-prices \
download-postcodes download-noise download-inspire download-crime \
download-oa-boundaries download-uprn-lookup download-transit-network download-greenspace download-os-greenspace download-pbf download-fr-tow download-nfi download-ofs-register download-places download-median-age download-england-boundary download-rightmove-outcodes \
download-map-assets \
transform-pois transform-epc-pp transform-crime transform-poi-proximity \
transform-school-proximity transform-tree-density \
transform-school-catchments transform-tree-density \
generate-postcode-boundaries generate-travel-times enrich-actual-listings
# Aggregate target for a full data build. Everything before `|` is a normal
# prerequisite; the parquet/index artefacts after `|` are order-only, so they
# must exist but their timestamps never force `prepare` to re-run.
# Each tile target is listed exactly once (property-border-tiles used to appear twice).
prepare: $(PRICES_STAMP) download-places tiles satellite-tiles overlay-tiles property-border-tiles tree-overlay-tiles crime-hotspot-tiles generate-postcode-boundaries download-map-assets generate-travel-times | $(POSTCODES_PQ) $(PROPERTIES_PQ) $(PRICE_INDEX)
@ -139,6 +140,7 @@ download-pois: $(POIS_RAW)
download-grocery-retail-points: $(GROCERY_RETAIL_POINTS)
download-ofsted: $(OFSTED)
download-gias: $(GIAS)
download-lsoa-children: $(LSOA_CHILDREN)
download-broadband: $(BROADBAND)
download-conservation-areas: $(CONSERVATION_AREAS)
download-listed-buildings: $(LISTED_BUILDINGS)
@ -150,7 +152,7 @@ download-inspire: $(INSPIRE_STAMP)
download-oa-boundaries: $(OA_BOUNDARIES)
download-uprn-lookup: $(UPRN_LOOKUP)
download-transit-network: $(TRANSIT_STAMP)
$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip --zip $(TRANSIT_DIR)/tfl_gtfs.zip
$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip
download-greenspace: $(GREENSPACE)
download-os-greenspace: $(OS_GREENSPACE)
download-pbf: $(PBF)
@ -168,11 +170,11 @@ transform-pois: $(POIS_FILTERED)
transform-epc-pp: $(EPC_PP)
transform-crime: $(CRIME)
transform-poi-proximity: $(POI_PROXIMITY)
transform-school-proximity: $(SCHOOL_PROX)
transform-school-catchments: $(SCHOOL_CATCH)
transform-tree-density: $(TREE_DENSITY_PC)
generate-postcode-boundaries: $(PC_BOUNDARIES_STAMP)
$(PC_BOUNDARIES_STAMP): $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP) $(ARCGIS) $(PC_BOUNDARIES_DEPS)
$(PC_BOUNDARIES_STAMP): $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP) $(ARCGIS) $(GREENSPACE) $(PC_BOUNDARIES_DEPS)
@rm -f $@
$(VALIDATE_OUTPUTS) --dir $(INSPIRE_DIR) --zip-glob "$(INSPIRE_DIR)::*.zip"
uv run python -m pipeline.transform.postcode_boundaries \
@ -180,6 +182,7 @@ $(PC_BOUNDARIES_STAMP): $(OA_BOUNDARIES) $(INSPIRE_STAMP) $(UPRN_LOOKUP) $(ARCGI
--arcgis $(ARCGIS) \
--oa-boundaries $(OA_BOUNDARIES) \
--inspire $(INSPIRE_DIR) \
--greenspace $(GREENSPACE) \
--output $(PC_BOUNDARIES)
$(VALIDATE_OUTPUTS) --active-postcode-boundary-match "$(ARCGIS)::$(PC_BOUNDARIES)"
@touch $@
@ -273,6 +276,9 @@ $(OFSTED):
$(GIAS): pipeline/download/gias.py
uv run python -m pipeline.download.gias --output $@
$(LSOA_CHILDREN): pipeline/download/lsoa_children.py
uv run python -m pipeline.download.lsoa_children --output $@
$(BROADBAND):
uv run python -m pipeline.download.broadband --output $@
@ -315,7 +321,7 @@ $(UPRN_LOOKUP):
$(TRANSIT_STAMP): $(TRANSIT_DOWNLOAD_DEPS)
@rm -f $@
uv run python -m pipeline.download.transit_network --output $(TRANSIT_DIR)
$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip --zip $(TRANSIT_DIR)/tfl_gtfs.zip
$(VALIDATE_OUTPUTS) --file $(TRANSIT_DIR)/raw/england.osm.pbf --zip $(TRANSIT_DIR)/bods_gtfs.zip
@touch $@
$(RENTAL): pipeline/download/rental_prices.py
@ -364,8 +370,8 @@ $(CRIME) $(CRIME_BY_YEAR) &: $(CRIME_STAMP) $(PC_BOUNDARIES_STAMP) pipeline/tran
$(POI_PROXIMITY): $(ARCGIS) $(POIS_FILTERED) $(OS_GREENSPACE) $(POI_PROXIMITY_DEPS)
uv run python -m pipeline.transform.poi_proximity --arcgis $(ARCGIS) --pois $(POIS_FILTERED) --greenspace $(OS_GREENSPACE) --output $@
$(SCHOOL_PROX): $(OFSTED) $(ARCGIS) $(GIAS) pipeline/transform/school_proximity.py pipeline/utils/poi_counts.py
uv run python -m pipeline.transform.school_proximity --ofsted $(OFSTED) --arcgis $(ARCGIS) --gias $(GIAS) --output $@
$(SCHOOL_CATCH): $(OFSTED) $(ARCGIS) $(GIAS) $(LSOA_CHILDREN) pipeline/transform/school_catchments.py pipeline/utils/poi_counts.py
uv run python -m pipeline.transform.school_catchments --ofsted $(OFSTED) --arcgis $(ARCGIS) --gias $(GIAS) --lsoa-children $(LSOA_CHILDREN) --output $@
$(TREE_DENSITY_PC): $(FR_TOW) $(NFI) $(ARCGIS) $(TREE_DENSITY_DEPS)
uv run python -m pipeline.transform.tree_density \
@ -386,6 +392,7 @@ $(PC_BOUNDARIES):
@echo " --arcgis $(ARCGIS) \\"
@echo " --oa-boundaries $(OA_BOUNDARIES) \\"
@echo " --inspire $(INSPIRE_DIR) \\"
@echo " --greenspace $(GREENSPACE) \\"
@echo " --output $@"
@echo ""
@exit 1
@ -393,7 +400,7 @@ $(PC_BOUNDARIES):
# ── Final merge → postcode.parquet + properties.parquet ──────────────────────
$(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) $(CONSERVATION_AREAS) $(LISTED_BUILDINGS) $(RENTAL) $(MEDIAN_AGE) $(ELECTION) $(TREE_DENSITY_PC) $(MERGE_DEPS)
$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_CATCH) $(BROADBAND) $(CONSERVATION_AREAS) $(LISTED_BUILDINGS) $(RENTAL) $(MEDIAN_AGE) $(ELECTION) $(TREE_DENSITY_PC) $(MERGE_DEPS)
@rm -f $@
uv run python -m pipeline.transform.merge \
--epc-pp $(EPC_PP) \
@ -403,7 +410,7 @@ $(MERGE_STAMP): $(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
--ethnicity $(ETHNICITY) \
--crime $(CRIME) \
--noise $(NOISE) \
--school-proximity $(SCHOOL_PROX) \
--school-catchments $(SCHOOL_CATCH) \
--broadband $(BROADBAND) \
--conservation-areas $(CONSERVATION_AREAS) \
--listed-buildings $(LISTED_BUILDINGS) \
@ -433,7 +440,7 @@ $(PRICES_STAMP): $(MERGE_STAMP) $(PRICE_INDEX) $(PRICE_ESTIMATE_DEPS) | $(PROPER
$(ACTUAL_LISTINGS_ENRICHED): $(ACTUAL_LISTINGS_RAW) $(EPC) \
$(EPC_PP) $(ARCGIS) $(IOD) $(POI_PROXIMITY) \
$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_PROX) $(BROADBAND) \
$(ETHNICITY) $(CRIME) $(NOISE) $(SCHOOL_CATCH) $(BROADBAND) \
$(CONSERVATION_AREAS) $(LISTED_BUILDINGS) $(RENTAL) \
$(MEDIAN_AGE) $(ELECTION) $(TREE_DENSITY_PC) \
$(MERGE_DEPS) pipeline/utils/fuzzy_join.py
@ -445,7 +452,7 @@ $(ACTUAL_LISTINGS_ENRICHED): $(ACTUAL_LISTINGS_RAW) $(EPC) \
--ethnicity $(ETHNICITY) \
--crime $(CRIME) \
--noise $(NOISE) \
--school-proximity $(SCHOOL_PROX) \
--school-catchments $(SCHOOL_CATCH) \
--broadband $(BROADBAND) \
--conservation-areas $(CONSERVATION_AREAS) \
--listed-buildings $(LISTED_BUILDINGS) \

File diff suppressed because one or more lines are too long

View file

@ -11,7 +11,7 @@ services:
command: >
bash -c "
cargo install cargo-watch &&
cargo watch --poll -i logs/ -x 'run -- --properties /app/property-data4/properties.parquet --postcode-features /app/property-data4/postcode.parquet --pois /app/property-data4/filtered_uk_pois.parquet --places /app/property-data4/places.parquet --tiles /app/property-data4/uk.pmtiles --postcodes /app/property-data4/postcode_boundaries --travel-times /app/property-data4/travel-times --satellite-tiles /app/property-data4/satellite.pmtiles --satellite-highres-tiles /app/property-data4/satellite_highres.pmtiles --noise-overlay-tiles /app/property-data4/noise_lden_10m.pmtiles --crime-hotspot-tiles /app/property-data4/crime_hotspots.pmtiles --tree-overlay-tiles /app/property-data4/trees_outside_woodlands.pmtiles --property-border-tiles /app/property-data4/property_borders.pmtiles'
cargo watch --poll -i logs/ -x 'run -- --properties /app/property-data/properties.parquet --postcode-features /app/property-data/postcode.parquet --pois /app/property-data/filtered_uk_pois.parquet --places /app/property-data/places.parquet --tiles /app/property-data/uk.pmtiles --postcodes /app/property-data/postcode_boundaries --travel-times /app/property-data/travel-times --satellite-tiles /app/property-data/satellite.pmtiles --satellite-highres-tiles /app/property-data/satellite_highres.pmtiles --noise-overlay-tiles /app/property-data/noise_lden_10m.pmtiles --crime-hotspot-tiles /app/property-data/crime_hotspots.pmtiles --tree-overlay-tiles /app/property-data/trees_outside_woodlands.pmtiles --property-border-tiles /app/property-data/property_borders.pmtiles --actual-listings-path /app/finder/data/online_listings_buy_enriched.parquet --crime-by-year-path /app/property-data/crime_by_postcode_by_year.parquet'
"
ports:
- "8001:8001"

Binary file not shown.

After

Width:  |  Height:  |  Size: 932 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 992 B

View file

@ -70,4 +70,14 @@
<changefreq>monthly</changefreq>
<priority>0.6</priority>
</url>
<url>
<loc>https://perfect-postcode.co.uk/terms</loc>
<changefreq>yearly</changefreq>
<priority>0.3</priority>
</url>
<url>
<loc>https://perfect-postcode.co.uk/privacy</loc>
<changefreq>yearly</changefreq>
<priority>0.3</priority>
</url>
</urlset>

View file

@ -107,6 +107,20 @@ const ROUTES = [
description:
'Learn how Perfect Postcode treats saved searches, account data and property research workflows with privacy and security in mind.',
},
{
path: '/terms',
output: 'terms/index.html',
title: 'Terms of Service | Perfect Postcode',
description:
'The terms that govern your use of Perfect Postcode, including lifetime access, acceptable use, data accuracy, payments and refunds.',
},
{
path: '/privacy',
output: 'privacy/index.html',
title: 'Privacy Policy | Perfect Postcode',
description:
'How Perfect Postcode collects, uses and protects your data: account details, payments, saved searches, AI queries, analytics and your UK GDPR rights.',
},
];
const FAQ_SCHEMA_ITEMS = [
@ -325,11 +339,16 @@ async function prerender() {
args: ['--no-sandbox', '--disable-setuid-sandbox'],
});
try {
const baseIndexHtml = cleanBaseIndexHtml(readFileSync(INDEX_PATH, 'utf-8'));
// Every real page renders tens of kB; a few hundred chars means the SPA
// raced hydration and we captured a loading shell.
const MIN_HTML_CHARS = 1000;
const MAX_ATTEMPTS = 3;
for (const route of ROUTES) {
const page = await browser.newPage();
async function renderRoute(route) {
// A fresh context per attempt: pages otherwise share cache/storage, and a
// poisoned chunk-fetch in the shared cache makes a route fail every retry.
const context = await browser.createBrowserContext();
const page = await context.newPage();
// Intercept API requests to prevent real fetches and retry loops.
await page.setRequestInterception(true);
@ -374,6 +393,7 @@ async function prerender() {
}
});
try {
await page.goto(`http://127.0.0.1:${port}${route.path}`, {
waitUntil: 'networkidle0',
timeout: 30000,
@ -402,8 +422,31 @@ async function prerender() {
return root.innerHTML;
});
if (!html || html.length < 100) {
throw new Error(`Prerender produced too little HTML for ${route.path}`);
if (!html || html.length < MIN_HTML_CHARS) {
throw new Error(
`Prerender produced too little HTML for ${route.path} (${html?.length ?? 0} chars)`
);
}
return html;
} finally {
await context.close().catch(() => {});
}
}
try {
const baseIndexHtml = cleanBaseIndexHtml(readFileSync(INDEX_PATH, 'utf-8'));
for (const route of ROUTES) {
let html = null;
for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt += 1) {
try {
html = await renderRoute(route);
break;
} catch (err) {
if (attempt === MAX_ATTEMPTS) throw err;
console.warn(`Retrying ${route.path} (attempt ${attempt} failed: ${err.message})`);
}
}
const updated = updateHead(baseIndexHtml, route).replace(
@ -418,7 +461,6 @@ async function prerender() {
const outputPath = join(DIST_DIR, route.output);
mkdirSync(dirname(outputPath), { recursive: true });
writeFileSync(outputPath, updated);
await page.close();
console.log(`Prerendered ${route.path} (${html.length} chars) into ${route.output}`);
}
} finally {

View file

@ -0,0 +1,68 @@
import { useTranslation } from 'react-i18next';
import { usePageMeta } from '../../hooks/usePageMeta';
import Footer from '../ui/Footer';
import { PRIVACY, TERMS, type LegalDoc } from './legal-content';
export type LegalKind = 'terms' | 'privacy';

/** Maps each legal page kind to the document it displays. */
const DOCS: Record<LegalKind, LegalDoc> = { terms: TERMS, privacy: PRIVACY };

/**
 * Full-page view of a single legal document (terms or privacy policy).
 *
 * Sets the page title/description via usePageMeta, renders the document
 * title, the "last updated" line, an "English only" notice when the active
 * i18n language does not start with "en", then the intro paragraphs and every
 * section (heading, paragraphs, optional bullet list), followed by the footer.
 */
export default function LegalPage({ kind }: { kind: LegalKind }) {
  const { t, i18n } = useTranslation();
  const { title, metaDescription, lastUpdated, intro, sections } = DOCS[kind];
  usePageMeta(`${title} | Perfect Postcode`, metaDescription);

  // The legal copy itself is never translated, so any non-English UI gets a notice.
  const isEnglishUi = i18n.language?.toLowerCase().startsWith('en');
  const showEnglishNotice = !isEnglishUi;

  const introParagraphs = intro.map((text) => (
    <p key={text} className="leading-relaxed text-warm-700 dark:text-warm-300">
      {text}
    </p>
  ));

  const renderSection = (section: LegalDoc['sections'][number]) => {
    const { heading, paragraphs, bullets } = section;
    return (
      <section key={heading}>
        <h2 className="text-lg font-semibold text-navy-950 dark:text-warm-100">{heading}</h2>
        {paragraphs.map((text) => (
          <p key={text} className="mt-2 leading-relaxed text-warm-700 dark:text-warm-300">
            {text}
          </p>
        ))}
        {bullets && (
          <ul className="mt-2 list-disc space-y-1.5 pl-5 text-warm-700 dark:text-warm-300">
            {bullets.map((item) => (
              <li key={item} className="leading-relaxed">
                {item}
              </li>
            ))}
          </ul>
        )}
      </section>
    );
  };

  return (
    <main className="flex-1 overflow-y-auto bg-warm-50 dark:bg-navy-950">
      <div className="mx-auto max-w-3xl px-4 py-10 sm:py-14">
        <h1 className="text-3xl font-bold text-navy-950 dark:text-warm-100">{title}</h1>
        <p className="mt-2 text-sm text-warm-500 dark:text-warm-400">
          {t('legal.lastUpdated', { date: lastUpdated })}
        </p>
        {showEnglishNotice && (
          <p className="mt-2 text-sm italic text-warm-500 dark:text-warm-400">
            {t('legal.englishOnly')}
          </p>
        )}
        <div className="mt-6 space-y-4">{introParagraphs}</div>
        <div className="mt-8 space-y-8">{sections.map(renderSection)}</div>
      </div>
      <Footer />
    </main>
  );
}

View file

@ -0,0 +1,183 @@
/**
* Legal documents are maintained in English only; the English text is the
* authoritative version (a localized notice says so on the page). Keeping
* legal copy out of the i18n catalogues avoids meaning drift in translation.
*
* TODO before launch: confirm the operator/legal-entity details below.
*/
/** One numbered section of a legal document. */
export interface LegalSection {
// Section title, shown as the section's heading.
heading: string;
// Body paragraphs shown under the heading; may be empty when the section is bullets only.
paragraphs: string[];
// Optional bullet list shown after the paragraphs.
bullets?: string[];
}
/** A complete legal document (terms of service or privacy policy). */
export interface LegalDoc {
// Document title; used as the page heading and in the page <title>.
title: string;
// Short summary used as the page meta description.
metaDescription: string;
// Human-readable revision date, interpolated into the "last updated" line.
lastUpdated: string;
// Opening paragraphs shown before the first section.
intro: string[];
// Sections in display order.
sections: LegalSection[];
}
/** Support address interpolated into the legal copy below. */
export const SUPPORT_EMAIL = 'support@perfect-postcode.co.uk';
/**
 * Terms of Service copy (English only).
 *
 * Pure data: a title, meta description, revision date, one intro paragraph
 * and ten numbered sections. Section headings carry their own numbering, so
 * renumber them by hand when inserting or removing a section.
 */
export const TERMS: LegalDoc = {
title: 'Terms of Service',
metaDescription:
'The terms that govern your use of Perfect Postcode, including lifetime access, acceptable use, data accuracy, payments and refunds.',
lastUpdated: '10 June 2026',
intro: [
`These terms govern your use of perfect-postcode.co.uk ("Perfect Postcode", "the service", "we", "us"). By creating an account or purchasing access you agree to them. If you have any questions, contact ${SUPPORT_EMAIL}.`,
],
sections: [
{
heading: '1. The service',
paragraphs: [
'Perfect Postcode is a research tool that combines public datasets about England — property transactions, energy certificates, schools, crime, noise, broadband, transport and more — on an interactive map, so you can shortlist areas that fit your needs before booking viewings.',
'We are not an estate agent, mortgage broker, surveyor or financial adviser, and the service does not provide financial, legal or investment advice.',
],
},
{
heading: '2. Accounts',
paragraphs: [
'You need an account to use the service beyond the free demo area. Provide a valid email address, keep your credentials secure, and do not share your account. Accounts are for one person each.',
'We may suspend or close accounts that breach these terms, abuse the service, or attempt to circumvent access restrictions. If we close your account without cause, we will refund the price you paid.',
],
},
{
heading: '3. Free demo and lifetime access',
paragraphs: [
'Free accounts can explore all features within the demo area (inner London). Lifetime access is a one-time payment that gives your account ongoing access to the paid map — every postcode, every filter — for as long as the service runs. It is not a subscription, and routine data updates are included.',
'Lifetime access is personal and non-transferable, and is for personal, non-commercial property research. If you would like to use Perfect Postcode commercially (for example in lettings, relocation or research services), contact us first.',
],
},
{
heading: '4. Acceptable use',
// The lead-in sentence is completed by the bullets below.
paragraphs: ['You agree not to:'],
bullets: [
'scrape, crawl or bulk-download data outside the export tools we provide;',
'resell, republish or redistribute the data or substantial extracts of it;',
'probe, disrupt or place unreasonable load on the service;',
'use the AI search or other features to process content you have no right to submit.',
],
},
{
heading: '5. Data accuracy',
paragraphs: [
'The maps and figures are built from public datasets (HM Land Registry, EPC register, ONS, Ofsted, DfT, police.uk and others) combined with modelling and estimation. Sources can be incomplete, out of date or wrong at the level of an individual property, and our estimates — including estimated current prices — are statistical indications, not valuations.',
'Always verify anything that matters in person and through professional advice (surveys, solicitors, mortgage advisers) before making offers or financial decisions. We provide the service "as is" and do not warrant that any figure is accurate, complete or current.',
],
},
{
heading: '6. Payments and refunds',
paragraphs: [
'Payments are processed by Stripe; we never see or store your card details. Prices are shown in pounds sterling at checkout. Early-access pricing tiers can change as tiers fill; the price shown at the moment you pay is the price you get.',
`If Perfect Postcode is not for you, email ${SUPPORT_EMAIL} within 14 days of purchase and we will refund you in full.`,
],
},
{
heading: '7. Third-party content',
paragraphs: [
'Street View imagery, listing-portal links and similar embedded content are provided by third parties and governed by their own terms. We are not responsible for their availability or accuracy.',
],
},
{
heading: '8. Liability',
paragraphs: [
'To the extent permitted by law, we are not liable for decisions made in reliance on the data, for indirect or consequential losses, or for interruptions to the service; our total liability to you is limited to the amount you paid us. Nothing in these terms excludes liability that cannot legally be excluded, and nothing affects your statutory rights as a consumer.',
],
},
{
heading: '9. Changes to the service or these terms',
paragraphs: [
'We are a small product that improves continuously; features and data sources may change. We may update these terms, and will note the date of the latest revision above. If a change is material we will flag it on the site or by email. Continued use after a change means you accept the updated terms.',
],
},
{
heading: '10. Governing law and contact',
paragraphs: [
`These terms are governed by the law of England and Wales, and disputes are subject to the jurisdiction of the courts of England and Wales (consumers keep any mandatory protections of their country of residence). Questions and complaints: ${SUPPORT_EMAIL} — we typically respond within 24 hours.`,
],
},
],
};
/**
 * Privacy Policy copy (English only).
 *
 * Pure data: a title, meta description, revision date, one intro paragraph
 * and nine numbered sections. Section headings carry their own numbering, so
 * renumber them by hand when inserting or removing a section.
 *
 * Strings that contain a possessive apostrophe use double quotes or template
 * literals so the apostrophe can be written plainly.
 */
export const PRIVACY: LegalDoc = {
  title: 'Privacy Policy',
  metaDescription:
    'How Perfect Postcode collects, uses and protects your data: account details, payments, saved searches, AI queries, analytics and your UK GDPR rights.',
  lastUpdated: '10 June 2026',
  intro: [
    `This policy explains what personal data Perfect Postcode ("we", "us") collects, why, and your rights over it. We handle personal data under UK data-protection law (UK GDPR and the Data Protection Act 2018). Contact: ${SUPPORT_EMAIL}.`,
  ],
  sections: [
    {
      heading: '1. What we collect',
      paragraphs: [],
      bullets: [
        'Account data: your email address, a hashed password (or your Google account identifier if you sign in with Google), newsletter preference and access status.',
        'Purchase records: what you bought and when. Payments are processed by Stripe; we never receive your card details.',
        'Things you create: saved searches, shared links and their settings.',
        'AI search queries: the text you type into the AI search is processed to generate filters and logged with your account so we can debug and improve the feature.',
        'Usage data: which pages and features are used, collected as events for product analytics, and standard server logs (IP address, user agent) kept for security.',
      ],
    },
    {
      heading: '2. How we use it',
      paragraphs: [],
      // Each bullet ends with the lawful basis relied on, in parentheses.
      bullets: [
        'To provide and secure the service, including signing you in and remembering your saved work (performance of contract).',
        'To process payments and keep the records tax law requires (legal obligation).',
        'To answer support requests (performance of contract).',
        'To send the newsletter, only if you opted in — every email includes an unsubscribe link (consent).',
        'To understand how features are used and improve them, using aggregated analytics and logged AI queries (legitimate interests).',
      ],
    },
    {
      heading: '3. Who we share it with',
      paragraphs: [
        'We do not sell personal data. We use a small number of processors to run the service:',
      ],
      bullets: [
        'Stripe — payment processing.',
        'Google — sign-in (if you choose Google OAuth), embedded Maps/Street View imagery, and the Gemini API which processes the text of AI searches.',
        'Hosting and infrastructure providers that run our servers and store backups.',
      ],
    },
    {
      heading: '4. International transfers',
      paragraphs: [
        'Some processors (such as Stripe and Google) process data outside the UK. Where that happens, transfers rely on UK adequacy decisions or standard contractual clauses.',
      ],
    },
    {
      heading: '5. Cookies and local storage',
      paragraphs: [
        // Fixed: "browsers" -> "browser's", "Googles" -> "Google's".
        "We do not use advertising cookies or third-party trackers. Your browser's local storage holds your sign-in token and preferences (theme, language, tutorial progress, last map view). Embedded Google content (Street View, sign-in) may set its own cookies under Google's policies.",
      ],
    },
    {
      heading: '6. Retention',
      paragraphs: [
        'Account data is kept while your account exists and deleted when you ask us to close it. Server logs are kept for a short period for security. Purchase records are kept for as long as tax law requires (typically six years).',
      ],
    },
    {
      heading: '7. Your rights',
      paragraphs: [
        // Fixed: "Information Commissioners Office" -> "Information Commissioner's Office".
        `You can ask for a copy of your data, have it corrected or deleted, restrict or object to processing, and receive your data in a portable format. Email ${SUPPORT_EMAIL} and we will respond promptly. If you are unhappy with how we handle your data you can complain to the Information Commissioner's Office (ico.org.uk).`,
      ],
    },
    {
      heading: '8. Children',
      paragraphs: ['The service is aimed at home buyers and renters and is not directed at children under 16.'],
    },
    {
      heading: '9. Changes to this policy',
      paragraphs: [
        'We will post any changes here and update the date at the top. Material changes will be flagged on the site or by email.',
      ],
    },
  ],
};

View file

@ -0,0 +1,79 @@
import { useTranslation } from 'react-i18next';
import { LogoIcon } from './icons/LogoIcon';
const SUPPORT_EMAIL = 'support@perfect-postcode.co.uk';
/** One footer navigation entry: a list item wrapping a styled anchor. */
function FooterLink(props: { href: string; label: string }) {
  const { href, label } = props;
  const linkClasses =
    'text-sm text-warm-500 hover:text-teal-600 dark:text-warm-400 dark:hover:text-teal-400 transition-colors';

  return (
    <li>
      <a href={href} className={linkClasses}>
        {label}
      </a>
    </li>
  );
}
/**
 * Site-wide footer: brand blurb, three link columns (product, resources,
 * legal) and a bottom strip with the copyright and coverage lines.
 */
export default function Footer() {
  const { t } = useTranslation();
  const year = new Date().getFullYear();

  // Link columns in display order; each becomes a labelled <nav> landmark.
  const columns = [
    {
      id: 'product',
      title: t('footer.product'),
      links: [
        { href: '/dashboard', label: t('header.dashboard') },
        { href: '/pricing', label: t('header.pricing') },
        { href: '/learn', label: t('header.learn') },
      ],
    },
    {
      id: 'resources',
      title: t('footer.resources'),
      links: [
        { href: '/data-sources', label: t('footer.dataSources') },
        { href: '/methodology', label: t('footer.methodology') },
        { href: `mailto:${SUPPORT_EMAIL}`, label: t('footer.contact') },
      ],
    },
    {
      id: 'legal',
      title: t('footer.legal'),
      links: [
        { href: '/terms', label: t('footer.terms') },
        { href: '/privacy', label: t('footer.privacy') },
      ],
    },
  ];

  return (
    <footer className="border-t border-warm-200 bg-warm-50 dark:border-warm-800 dark:bg-navy-950">
      <div className="mx-auto max-w-6xl px-4 py-10">
        <div className="grid gap-8 sm:grid-cols-2 md:grid-cols-4">
          <div>
            <a href="/" className="flex items-center gap-2 hover:opacity-80 transition-opacity">
              <LogoIcon className="h-5 w-5 shrink-0 text-teal-500" />
              <span className="text-base font-semibold text-navy-950 dark:text-teal-300">
                {t('header.appName')}
              </span>
            </a>
            <p className="mt-3 text-sm leading-relaxed text-warm-500 dark:text-warm-400">
              {t('footer.tagline')}
            </p>
          </div>
          {columns.map((column) => (
            <nav key={column.id} aria-label={column.title}>
              <h2 className="text-xs font-semibold uppercase tracking-wide text-warm-400 dark:text-warm-500">
                {column.title}
              </h2>
              <ul className="mt-3 space-y-2">
                {column.links.map((link) => (
                  <FooterLink key={link.href} href={link.href} label={link.label} />
                ))}
              </ul>
            </nav>
          ))}
        </div>
        <div className="mt-10 flex flex-col gap-2 border-t border-warm-200 pt-6 text-xs text-warm-400 dark:border-warm-800 dark:text-warm-500 sm:flex-row sm:items-center sm:justify-between">
          <p>{t('footer.copyright', { year })}</p>
          <p>{t('footer.coverage')}</p>
        </div>
      </div>
    </footer>
  );
}

View file

@ -0,0 +1,18 @@
import { useEffect, useState } from 'react';
/**
 * Reports whether the `dark` class is currently set on the <html> element.
 *
 * The initial value is read once on first render; afterwards a
 * MutationObserver on the element's `class` attribute keeps it current.
 * Intended for components that are not handed the theme as a prop (showcase,
 * pricing backdrop) yet need canvas/map content to follow it.
 */
export function useIsDarkTheme(): boolean {
  const readDarkClass = () => document.documentElement.classList.contains('dark');
  const [isDark, setIsDark] = useState(readDarkClass);

  useEffect(() => {
    const root = document.documentElement;
    const classWatcher = new MutationObserver(() => {
      setIsDark(root.classList.contains('dark'));
    });
    // Only `class` changes matter; other attribute mutations are filtered out.
    classWatcher.observe(root, { attributes: true, attributeFilter: ['class'] });
    return () => {
      classWatcher.disconnect();
    };
  }, []);

  return isDark;
}

View file

@ -0,0 +1,35 @@
import { describe, expect, it } from 'vitest';
import { MAP_MIN_ZOOM } from './consts';
import { boundsToCenterZoom } from './fit-bounds';
describe('boundsToCenterZoom', () => {
  it('centers on the middle of the box', () => {
    const { lat, lng } = boundsToCenterZoom({ south: 51.4, north: 51.6, west: -0.3, east: 0.1 });

    expect(lat).toBeCloseTo(51.5, 5);
    expect(lng).toBeCloseTo(-0.1, 5);
  });

  it('zooms close for a small box and far out for a country-sized box', () => {
    const streetZoom = boundsToCenterZoom({
      south: 51.5,
      north: 51.51,
      west: -0.11,
      east: -0.1,
    }).zoom;
    const englandZoom = boundsToCenterZoom({
      south: 50.0,
      north: 55.5,
      west: -5.7,
      east: 1.8,
    }).zoom;

    expect(streetZoom).toBeGreaterThan(englandZoom);
    expect(englandZoom).toBeGreaterThanOrEqual(MAP_MIN_ZOOM);

    // A box roughly the size of Greater London should resolve to a city-scale zoom.
    const londonZoom = boundsToCenterZoom({
      south: 51.44,
      north: 51.59,
      west: -0.31,
      east: 0.05,
    }).zoom;
    expect(londonZoom).toBeGreaterThan(8);
    expect(londonZoom).toBeLessThan(12);
  });

  it('caps zoom-in for degenerate (single point) boxes', () => {
    const samePoint = { south: 51.5, north: 51.5, west: -0.1, east: -0.1 };

    expect(boundsToCenterZoom(samePoint).zoom).toBeLessThanOrEqual(13);
  });

  it('tolerates swapped corners', () => {
    // north/south and east/west are deliberately given the wrong way round.
    const { lat, lng, zoom } = boundsToCenterZoom({
      south: 51.6,
      north: 51.4,
      west: 0.1,
      east: -0.3,
    });

    expect(lat).toBeCloseTo(51.5, 5);
    expect(lng).toBeCloseTo(-0.1, 5);
    expect(Number.isFinite(zoom)).toBe(true);
  });
});

View file

@ -0,0 +1,45 @@
import { MAP_MIN_ZOOM } from './consts';
/**
 * Geographic bounding box in degrees. boundsToCenterZoom accepts the
 * corners in either order (it takes min/max of each pair itself).
 */
export interface GeoBounds {
// Latitude of one horizontal edge (nominally the southern one).
south: number;
// Longitude of one vertical edge (nominally the western one).
west: number;
// Latitude of the opposite horizontal edge (nominally the northern one).
north: number;
// Longitude of the opposite vertical edge (nominally the eastern one).
east: number;
}
/**
 * Nominal viewport used to derive a zoom from a bounding box. The map only
 * exposes flyTo(lat, lng, zoom), so we approximate fitBounds; being half a
 * zoom level off for unusual window sizes is fine for "show me the matches".
 */
const NOMINAL_VIEWPORT = { width: 1000, height: 700 };
// Pixel width of the whole world at zoom 0 in the zoom formula below.
// NOTE(review): presumably matches the map renderer's tile size — confirm.
const TILE_SIZE = 512;
/** Keep matches comfortably inside the viewport edges. */
const ZOOM_PADDING = 0.4;
// Upper clamp on the computed zoom, so tiny or single-point boxes do not zoom in further.
const MAX_FIT_ZOOM = 13;
/**
 * Web-Mercator projected y for a latitude given in degrees:
 * ln(tan(pi/4 + lat/2)) with lat converted to radians. Unitless, where the
 * full longitude range corresponds to 2*pi.
 */
function mercatorY(lat: number): number {
  const latitudeRadians = (lat * Math.PI) / 180;
  const tangent = Math.tan(Math.PI / 4 + latitudeRadians / 2);
  return Math.log(tangent);
}
/**
 * Turn a bounding box into a flyTo target (centre + zoom) that roughly fits
 * the box inside the nominal viewport.
 *
 * Corners may be supplied in either order. The zoom is the tighter of the
 * horizontal and vertical fits, reduced by ZOOM_PADDING and then clamped to
 * [MAP_MIN_ZOOM, MAX_FIT_ZOOM].
 */
export function boundsToCenterZoom(bounds: GeoBounds): { lat: number; lng: number; zoom: number } {
  // Normalise so swapped corners still describe the same box.
  const south = Math.min(bounds.south, bounds.north);
  const north = Math.max(bounds.south, bounds.north);
  const west = Math.min(bounds.west, bounds.east);
  const east = Math.max(bounds.west, bounds.east);

  // Floor both spans at a tiny epsilon so a single-point box cannot divide by zero.
  const lonSpan = Math.max(east - west, 1e-6);
  const mercSpan = Math.max(mercatorY(north) - mercatorY(south), 1e-6);

  const { width, height } = NOMINAL_VIEWPORT;
  const zoomX = Math.log2((width * 360) / (TILE_SIZE * lonSpan));
  const zoomY = Math.log2((height * 2 * Math.PI) / (TILE_SIZE * mercSpan));

  const paddedFit = Math.min(zoomX, zoomY) - ZOOM_PADDING;
  const cappedFit = Math.min(MAX_FIT_ZOOM, paddedFit);
  const zoom = Math.max(MAP_MIN_ZOOM, cappedFit);

  const lat = (south + north) / 2;
  const lng = (west + east) / 2;
  return { lat, lng, zoom };
}

View file

@ -0,0 +1,297 @@
"""Evaluate modelled school catchment radii against published cutoffs.
Local authorities publish each school's "last distance offered" in their
yearly allocation reports; ``property-data/ground_truth/cutoffs_*.json``
holds a scraped sample of those figures (see the collection notes in each
file's ``source_url`` fields). This script matches them to the per-school
radii emitted by ``pipeline.transform.school_catchments --schools-output``
and reports how well the model reproduces reality, so the preference-bonus
constants can be calibrated.
Headline metrics use non-faith schools whose published cutoff was a binding
distance. Faith schools are reported separately (their distance criterion
applies within faith priority, so published figures aren't comparable), as
are "all applicants offered" schools, where the model should ideally show no
binding cutoff.
"""
import argparse
import difflib
import json
import re
from pathlib import Path
import numpy as np
import polars as pl
# Words that reports include or omit freely: articles/conjunctions plus
# denominational and governance qualifiers (C of E, RC, voluntary aided, ...).
_NOISE_WORDS = re.compile(
    r"\b(the|of|and|c\s*of\s*e|cofe|ce|rc|voluntary|aided|controlled|va|vc)\b"
)
# Anything other than lowercase letters, digits and spaces.
_NON_ALNUM = re.compile(r"[^a-z0-9 ]")
# School-type words dropped only for the looser "stripped" comparison.
_SCHOOL_WORDS = re.compile(
    r"\b(school|academy|primary|secondary|junior|infant|community|college|high)\b"
)
# Apostrophe variants are deleted rather than turned into a space, so
# "Mary's", "Mary’s" and "Marys" all normalise to "marys".
_APOSTROPHES = str.maketrans("", "", "'\u2019\u2018\u02bc`")


def normalize_name(name: str, strip_school_words: bool = False) -> str:
    """Reduce a school name to a comparison key.

    Lowercases, expands "&" to "and", deletes apostrophes (straight and
    typographic), replaces all remaining punctuation with spaces, removes the
    noise words in ``_NOISE_WORDS`` and collapses whitespace.

    Args:
        name: School name as published.
        strip_school_words: Also remove school-type words ("school",
            "primary", "academy", ...) so informal names match formal ones.

    Returns:
        The normalised, single-space-separated key (may be empty).
    """
    # "St." needs no special case: the full stop is non-alphanumeric and is
    # replaced by a space below, exactly like any other punctuation.
    s = name.lower().replace("&", " and ").translate(_APOSTROPHES)
    s = _NON_ALNUM.sub(" ", s)
    s = _NOISE_WORDS.sub(" ", s)
    if strip_school_words:
        s = _SCHOOL_WORDS.sub(" ", s)
    return " ".join(s.split())
def normalize_la(la: str) -> str:
    """Reduce a local-authority name to a comparison key.

    Lowercases, expands "&" to "and", replaces punctuation with spaces,
    removes the phrase "city of" and collapses whitespace.
    """
    lowered = la.lower().replace("&", " and ")
    cleaned = _NON_ALNUM.sub(" ", lowered).replace("city of", "")
    return " ".join(cleaned.split())
def load_ground_truth(directory: Path) -> pl.DataFrame:
    """Load every ``cutoffs_*.json`` file under ``directory`` into one frame.

    Each file is a JSON array of objects. ``school_name``, ``la`` and
    ``phase`` are required; the remaining keys are optional and default to
    0 / None / False / "" as shown below.

    Args:
        directory: Folder holding the scraped ground-truth files.

    Returns:
        One row per published cutoff record.

    Raises:
        SystemExit: If no matching file contains any row.
    """
    rows = []
    for path in sorted(directory.glob("cutoffs_*.json")):
        # JSON is UTF-8 by specification; do not rely on the platform default
        # encoding (school names contain non-ASCII punctuation).
        for row in json.loads(path.read_text(encoding="utf-8")):
            rows.append(
                {
                    "school_name": row["school_name"],
                    "la": row["la"],
                    "phase": row["phase"],
                    "entry_year": int(row.get("entry_year") or 0),
                    "cutoff_km": (
                        float(row["cutoff_km"])
                        if row.get("cutoff_km") is not None
                        else None
                    ),
                    "all_offered": bool(row.get("all_offered", False)),
                    "faith_school": bool(row.get("faith_school", False)),
                    "school_postcode": row.get("school_postcode"),
                    "source_url": row.get("source_url", ""),
                }
            )
    if not rows:
        raise SystemExit(f"No cutoffs_*.json files with rows under {directory}")
    # Pin the nullable columns' dtypes so a run of leading nulls cannot make
    # schema inference pick the Null dtype.
    df = pl.DataFrame(
        rows,
        schema_overrides={"school_postcode": pl.Utf8, "cutoff_km": pl.Float64},
    )
    print(f"Ground truth rows: {len(df)} from {directory}")
    return df
def match_schools(truth: pl.DataFrame, gias: pl.DataFrame) -> pl.DataFrame:
    """Attach GIAS URNs to ground-truth rows by postcode, then name.

    Matching runs in four stages of decreasing strictness; a row matched by an
    earlier stage is never reconsidered:

    1. exact postcode (GIAS postcodes used by exactly one school),
    2. exact normalised (name, LA),
    3. (name, LA) with school-type words stripped,
    4. fuzzy name within the LA (unique best candidate >= 0.87).

    Args:
        truth: Ground-truth rows with ``school_name``, ``la``, ``phase`` and
            ``school_postcode`` columns.
        gias: GIAS rows with ``urn``, ``name``, ``postcode`` and
            ``local_authority`` columns.

    Returns:
        The matched subset of ``truth`` with the helper columns (``_row_id``,
        ``_name_norm``, ``_name_stripped``, ``_la_norm``, ``_pc``) and a
        ``urn`` column. Empty, with the same columns, when nothing matches.
    """

    def stripped(name: str) -> str:
        return normalize_name(name, strip_school_words=True)

    gias = gias.with_columns(
        pl.col("name")
        .map_elements(normalize_name, return_dtype=pl.Utf8)
        .alias("_name_norm"),
        pl.col("name")
        .map_elements(stripped, return_dtype=pl.Utf8)
        .alias("_name_stripped"),
        pl.col("local_authority")
        .map_elements(normalize_la, return_dtype=pl.Utf8)
        .alias("_la_norm"),
        pl.col("postcode").str.replace_all(" ", "").str.to_uppercase().alias("_pc"),
    )
    truth = truth.with_columns(
        pl.col("school_name")
        .map_elements(normalize_name, return_dtype=pl.Utf8)
        .alias("_name_norm"),
        pl.col("school_name")
        .map_elements(stripped, return_dtype=pl.Utf8)
        .alias("_name_stripped"),
        pl.col("la")
        .map_elements(normalize_la, return_dtype=pl.Utf8)
        .alias("_la_norm"),
        pl.col("school_postcode")
        .str.replace_all(" ", "")
        .str.to_uppercase()
        .alias("_pc"),
    ).with_row_index("_row_id")

    # 1. Exact postcode match (unique postcodes only — site-sharing schools
    #    would mismatch phases otherwise; those fall through to name matching).
    pc_unique = gias.filter(pl.col("_pc").is_not_null()).unique(
        subset="_pc", keep="none"
    )
    by_pc = truth.filter(pl.col("_pc").is_not_null()).join(
        pc_unique.select("_pc", "urn"), on="_pc", how="inner"
    )
    matched_ids = set(by_pc["_row_id"].to_list())

    # 2. Exact normalized (name, LA) match against GIAS names that are unique
    #    within their LA.
    gias_named = gias.unique(subset=["_name_norm", "_la_norm"], keep="none")
    remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))
    by_name = remaining.join(
        gias_named.select("_name_norm", "_la_norm", "urn"),
        on=["_name_norm", "_la_norm"],
        how="inner",
    )
    matched_ids |= set(by_name["_row_id"].to_list())

    # 3. Reports often print informal names ("Ashmole Primary" for "Ashmole
    #    Primary School"): match on names with school-type words stripped,
    #    unique on both sides so site-sharing infant/junior pairs fall through.
    gias_stripped = gias.filter(pl.col("_name_stripped") != "").unique(
        subset=["_name_stripped", "_la_norm"], keep="none"
    )
    remaining = truth.filter(
        (~pl.col("_row_id").is_in(list(matched_ids))) & (pl.col("_name_stripped") != "")
    ).unique(subset=["_name_stripped", "_la_norm", "phase"], keep="none")
    by_stripped = remaining.join(
        gias_stripped.select("_name_stripped", "_la_norm", "urn"),
        on=["_name_stripped", "_la_norm"],
        how="inner",
    )
    matched_ids |= set(by_stripped["_row_id"].to_list())

    # 4. Fuzzy name match within the LA: unique best candidate >= 0.87.
    remaining = truth.filter(~pl.col("_row_id").is_in(list(matched_ids)))
    fuzzy_ids = []
    fuzzy_urns = []
    gias_by_la: dict[str, pl.DataFrame] = {}
    for row in remaining.iter_rows(named=True):
        la = row["_la_norm"]
        if la not in gias_by_la:
            gias_by_la[la] = gias.filter(pl.col("_la_norm") == la)
        candidates = gias_by_la[la]
        if candidates.is_empty():
            continue
        scores = [
            difflib.SequenceMatcher(None, row["_name_norm"], cand).ratio()
            for cand in candidates["_name_norm"].to_list()
        ]
        order = np.argsort(scores)[::-1]
        # Accept only a clear winner: good enough in absolute terms and at
        # least 0.04 ahead of the runner-up.
        if scores[order[0]] >= 0.87 and (
            len(order) == 1 or scores[order[1]] < scores[order[0]] - 0.04
        ):
            fuzzy_ids.append(row["_row_id"])
            fuzzy_urns.append(candidates["urn"][int(order[0])])

    # Join the (row id, urn) pairs back onto the typed frame instead of
    # rebuilding a frame from dict rows: re-inferring dtypes from the fuzzy
    # subset alone (e.g. an all-null postcode column) yields a schema that
    # cannot be concatenated with the other stages.
    fuzzy_pairs = pl.DataFrame(
        {"_row_id": fuzzy_ids, "urn": fuzzy_urns},
        schema={"_row_id": truth.schema["_row_id"], "urn": gias.schema["urn"]},
    )
    by_fuzzy = remaining.join(fuzzy_pairs, on="_row_id", how="inner")

    # Every stage derives from `truth` plus GIAS's `urn`, so all four frames
    # share one schema and concatenate cleanly even when some are empty.
    parts = [by_pc, by_name, by_stripped, by_fuzzy]
    matched = pl.concat([p.select(truth.columns + ["urn"]) for p in parts]).unique(
        subset="_row_id", keep="first"
    )
    print(
        f"Matched {len(matched)}/{len(truth)} ground-truth rows to GIAS URNs "
        f"(postcode {len(by_pc)}, exact name {len(by_name)}, "
        f"stripped {len(by_stripped)}, fuzzy {len(by_fuzzy)})"
    )
    return matched
def evaluate(matched: pl.DataFrame, radii: pl.DataFrame) -> pl.DataFrame:
    """Print accuracy metrics of modelled radii against published cutoffs.

    Args:
        matched: Ground-truth rows with ``urn``, ``phase``, ``cutoff_km``,
            ``all_offered`` and ``faith_school`` columns.
        radii: Modelled per-school rows with ``urn``, ``phase``,
            ``radius_km`` and ``filled`` columns.

    Returns:
        The joined rows whose published cutoff was a binding distance
        (not all-offered, cutoff within 0.05-20 km), faith schools included.
    """
    joined = matched.join(radii, on=["urn", "phase"], how="inner")
    print(f"Joined to modelled radii: {len(joined)} rows")

    # Published figures occasionally include non-typical admits (a child who
    # moved mid-process can print as hundreds of km); cap at distances a
    # distance criterion can plausibly produce.
    binding = joined.filter(
        ~pl.col("all_offered") & pl.col("cutoff_km").is_between(0.05, 20.0)
    )

    def report(df: pl.DataFrame, label: str) -> None:
        if df.is_empty():
            print(f"\n{label}: no rows")
            return
        truth_km = df["cutoff_km"].to_numpy()
        model_km = df["radius_km"].cast(pl.Float64).to_numpy()
        # NOTE(review): the radii schema is not visible here; if radius_km can
        # be null or zero, log2 yields NaN/-inf and a single NaN turns every
        # median below into NaN. Exclude such rows and say so.
        usable = np.isfinite(model_km) & (model_km > 0)
        if not usable.all():
            print(
                f"\n{label}: ignoring {int((~usable).sum())} rows without a "
                f"positive modelled radius"
            )
            truth_km, model_km = truth_km[usable], model_km[usable]
        if model_km.size == 0:
            print(f"\n{label}: no rows")
            return
        # log2(model / truth): 0 = exact, +1 = model twice the published
        # distance, -1 = half of it.
        log_ratio = np.log2(model_km / truth_km)
        within2 = float(np.mean(np.abs(log_ratio) <= 1))
        rank = (
            pl.DataFrame({"t": truth_km, "m": model_km})
            .select(pl.corr("t", "m", method="spearman"))
            .item()
        )
        print(
            f"\n{label} (n={len(model_km)}):\n"
            f"  median bias (log2 model/truth): {np.median(log_ratio):+.2f} "
            f"(x{2 ** np.median(log_ratio):.2f})\n"
            f"  median |log2 error|:            {np.median(np.abs(log_ratio)):.2f} "
            f"(x{2 ** np.median(np.abs(log_ratio)):.2f})\n"
            f"  within factor 2:                {within2:.0%}\n"
            f"  Spearman rank corr:             {rank:.2f}"
        )

    for phase in ("primary", "secondary"):
        report(
            binding.filter((pl.col("phase") == phase) & ~pl.col("faith_school")),
            f"BINDING, non-faith, {phase}",
        )
    report(binding.filter(pl.col("faith_school")), "BINDING, faith (informational)")

    offered = joined.filter(pl.col("all_offered"))
    if not offered.is_empty():
        # Share of all-offered schools the model also treats as not filled.
        unbound_share = float((~offered["filled"]).mean())
        print(
            f"\nALL-OFFERED schools (n={len(offered)}): model agrees no binding "
            f"cutoff for {unbound_share:.0%}; median modelled radius "
            f"{offered['radius_km'].median():.2f} km"
        )
    return binding
def main() -> None:
    """CLI entry point: load inputs, match schools, print the evaluation.

    Raises:
        SystemExit: If no ground-truth rows exist, or none of them yields a
            binding cutoff that could be matched to a modelled radius.
    """
    parser = argparse.ArgumentParser(
        description="Compare modelled catchment radii with published cutoffs"
    )
    parser.add_argument(
        "--ground-truth-dir",
        type=Path,
        default=Path("property-data/ground_truth"),
    )
    parser.add_argument(
        "--radii",
        type=Path,
        default=Path("property-data/school_catchment_radii.parquet"),
        help="Per-school radii parquet from school_catchments --schools-output",
    )
    parser.add_argument("--gias", type=Path, default=Path("property-data/gias.parquet"))
    parser.add_argument(
        "--matched-out",
        type=Path,
        default=None,
        help="Optional CSV of matched rows for inspection",
    )
    args = parser.parse_args()

    truth = load_ground_truth(args.ground_truth_dir)
    # One row per school+phase: keep the most recent entry year.
    # maintain_order keeps the row order reproducible between runs.
    truth = truth.sort("entry_year", descending=True).unique(
        subset=["school_name", "la", "phase"], keep="first", maintain_order=True
    )

    gias = pl.read_parquet(args.gias).select(
        "urn", "name", "postcode", "local_authority", "religious_character"
    )
    radii = pl.read_parquet(args.radii).unique(subset=["urn", "phase"], keep="first")

    matched = match_schools(truth, gias.drop("religious_character"))
    # GIAS religious character is authoritative; the scraped flag is used only
    # where GIAS records no religious character for the matched school.
    matched = matched.join(
        gias.select("urn", "religious_character"), on="urn", how="left"
    ).with_columns(
        pl.when(pl.col("religious_character").is_not_null())
        .then(~pl.col("religious_character").is_in(["None", "Does not apply"]))
        .otherwise(pl.col("faith_school"))
        .alias("faith_school")
    )

    binding = evaluate(matched, radii)

    if args.matched_out is not None:
        # Drop every matching helper column, including `_name_stripped`, so
        # the inspection CSV holds only real data.
        out = matched.join(radii, on=["urn", "phase"], how="inner").drop(
            "_row_id", "_name_norm", "_name_stripped", "_la_norm", "_pc"
        )
        args.matched_out.parent.mkdir(parents=True, exist_ok=True)
        out.write_csv(args.matched_out)
        print(f"\nWrote matched rows to {args.matched_out}")

    if binding.is_empty():
        raise SystemExit("No binding, matchable cutoffs — nothing to calibrate on")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,93 @@
"""Download Census 2021 children by five-year age band per LSOA.
Source: NOMIS (ONS Census 2021 TS007A dataset, age by five-year bands)
License: Open Government Licence v3.0
Used to estimate how many primary-age (4-10) and secondary-age (11-15)
children live in each LSOA, which drives the school catchment model. Census
bands don't align with school phases, so phase totals take fractional shares
of the 0-4, 10-14 and 15-19 bands (one fifth per single year of age).
"""
import argparse
from io import BytesIO
from pathlib import Path
import httpx
import polars as pl
# NOMIS API: Census 2021 TS007A (age, five-year bands) by LSOA 2021 (TYPE151).
# c2021_age_19 codes: 1 = 0-4, 2 = 5-9, 3 = 10-14, 4 = 15-19.
# NOMIS paginates at 25,000 rows by default, so we paginate with recordoffset.
BASE_URL = (
    "https://www.nomisweb.co.uk/api/v01/dataset/NM_2020_1.data.csv"
    "?date=latest&geography=TYPE151&measures=20100&c2021_age_19=1,2,3,4"
    "&select=GEOGRAPHY_CODE,C2021_AGE_19,OBS_VALUE"
)
# Rows per page: the recordoffset stride, and the threshold below which a
# page is treated as the last one.
PAGE_SIZE = 25000

# c2021_age_19 code -> output column name for that age band.
AGE_BAND_COLUMNS = {
    1: "aged_0_4",
    2: "aged_5_9",
    3: "aged_10_14",
    4: "aged_15_19",
}
def download_and_convert(output_path: Path) -> None:
    """Download the NOMIS age-band table and write one row per English LSOA.

    Pages through the API in ``PAGE_SIZE`` steps, pivots the long
    (LSOA, age band, count) rows to one column per band named by
    ``AGE_BAND_COLUMNS``, keeps LSOA codes starting with "E" and writes a
    zstd-compressed parquet file.

    Args:
        output_path: Destination parquet path; parent folders are created.

    Raises:
        httpx.HTTPStatusError: If a page request returns an error status.
        ValueError: If NOMIS returns no rows, or an expected age band is
            absent from the response.
    """
    print("Downloading Census 2021 LSOA age bands from NOMIS...")

    frames = []
    offset = 0
    while True:
        url = f"{BASE_URL}&recordoffset={offset}"
        response = httpx.get(url, follow_redirects=True, timeout=120)
        response.raise_for_status()

        if len(response.content) == 0:
            break

        chunk = pl.read_csv(BytesIO(response.content))
        if chunk.height == 0:
            break

        frames.append(chunk)
        print(f"  Fetched {chunk.height} rows (offset={offset})")

        # A short page is the last page.
        if chunk.height < PAGE_SIZE:
            break
        offset += PAGE_SIZE

    if not frames:
        raise ValueError("NOMIS returned no rows for Census 2021 LSOA age bands")

    df = pl.concat(frames)
    print(f"Total rows: {df.height}")

    pivoted = df.rename({"GEOGRAPHY_CODE": "lsoa21"}).pivot(
        on="C2021_AGE_19", index="lsoa21", values="OBS_VALUE"
    )

    # Validate BEFORE renaming: rename/cast on an absent column would fail
    # first with a generic column-not-found error.
    missing = [
        name for code, name in AGE_BAND_COLUMNS.items() if str(code) not in pivoted.columns
    ]
    if missing:
        raise ValueError(f"NOMIS response missing age bands: {missing}")

    result = (
        pivoted.rename({str(code): name for code, name in AGE_BAND_COLUMNS.items()})
        .with_columns(
            [pl.col(name).cast(pl.UInt32) for name in AGE_BAND_COLUMNS.values()]
        )
        .filter(pl.col("lsoa21").str.starts_with("E"))
        .sort("lsoa21")
    )

    print(f"England LSOAs: {result.height}")
    for name in AGE_BAND_COLUMNS.values():
        print(f"  {name}: total {result[name].sum():,}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
def main() -> None:
    """Parse ``--output`` from the command line and run the download."""
    cli = argparse.ArgumentParser(
        description="Download Census 2021 age bands (children) by LSOA"
    )
    cli.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    download_and_convert(cli.parse_args().output)


if __name__ == "__main__":
    main()

View file

@ -12,8 +12,18 @@ import polars as pl
NAPTAN_CSV_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
TUBE_STATION_CATEGORY = "Tube station"
TRAM_METRO_CATEGORY = "Tram & Metro stop"
TUBE_STATION_MERGE_RADIUS_DEGREES = 0.01
# London Underground ATCO codes are "<area><kind>ZZLU<station>": a 3-digit
# AdministrativeAreaCode (940 national, 490 London, plus 150/210/040/... for
# LU stations outside Greater London such as Epping or Amersham), then "0"
# (platform/entrance node) or "G" (station group node), then the system code.
# "ZZLU" is unique to London Underground, which cleanly separates genuine Tube
# stations from every other TMU/MET network (Metrolink, Supertram, T&W Metro,
# WM Metro, Blackpool Tramway, heritage railways, ...).
LONDON_UNDERGROUND_ATCO_PATTERN = r"(?i)^\d{3}[0G]ZZLU"
STOP_TYPES = {
"AIR": "Airport",
@ -25,25 +35,110 @@ STOP_TYPES = {
"RLY": "Rail station",
"RSE": "Rail station",
"BCT": "Bus stop",
# Bus/coach stations: BST is the station access-area node, BCS/BCQ are
# bays/stands within the station and BCE is a station entrance. NaPTAN maps
# very few BCE nodes (~80 GB-wide), so without BST/BCS/BCQ the category was
# so sparse that 20% of England showed the nearest bus station >100km away.
# Bays and entrances collapse to one POI per station via
# STATION_MERGE_CATEGORIES below.
"BST": "Bus station",
"BCS": "Bus station",
"BCQ": "Bus station",
"BCE": "Bus station",
"TXR": "Taxi rank",
"TMU": "Tube station",
"MET": "Tube station",
# Tram/Metro/Underground: TMU is an entrance node, MET the station access
# area. Both start as "Tram & Metro stop"; merged stations whose ATCO codes
# mark them as London Underground (ZZLU) are reclassified to "Tube station"
# after dedup (see _deduplicate_station_areas). Heritage railways (RHDR,
# Severn Valley, ...) are TMU/MET in NaPTAN with no machine-readable
# "heritage" flag, so they remain in "Tram & Metro stop".
"TMU": TRAM_METRO_CATEGORY,
"MET": TRAM_METRO_CATEGORY,
}
# Stop types that are access/entrance nodes rather than the primary station or
# terminal node. During dedup the primary node (e.g. RLY/FER) wins so a station
# with both a station node and entrances yields one POI at the station node.
ENTRANCE_STOP_TYPES = {"RSE", "FTD"}
# terminal node. During dedup the primary node (e.g. RLY/FER/MET) wins so a
# station with both a station node and entrances yields one POI at the station
# node.
ENTRANCE_STOP_TYPES = {"RSE", "FTD", "TMU", "BCE"}
# Categories whose entrances/variants are merged into a single station-level POI
# by normalized name + area (like Tube stations), so an RLY node and its RSE
# entrances collapse to one POI at the station node.
STATION_MERGE_CATEGORIES = {TUBE_STATION_CATEGORY, "Rail station", "Ferry"}
STATION_MERGE_CATEGORIES = {
TRAM_METRO_CATEGORY,
TUBE_STATION_CATEGORY,
"Rail station",
"Ferry",
"Bus station",
}
OUTPUT_COLUMNS = ["id", "name", "category", "lat", "lng"]
# Trailing entrance designators ("North East Ent", "Main Entrance No 2",
# "West Station Entrance", ...) are stripped from canonical names so a
# station's individually-named entrance nodes collapse into the station.
# A trailing run of filler words is only stripped when it contains at least
# one entrance word, so "Maze Hill North" or "Platform 1" are untouched.
_ENTRANCE_NAME_WORDS = {"ent", "entrance", "entrances", "access"}
_ENTRANCE_FILLER_WORDS = {
"north",
"south",
"east",
"west",
"ne",
"nw",
"se",
"sw",
"n",
"s",
"e",
"w",
"wt",
"main",
"side",
"no",
"station",
"stop",
"platform",
}
_ENTRANCE_WORDS_RE = "(?:ent|entrance|entrances|access)"
_ENTRANCE_FILLER_RE = (
r"(?:north|south|east|west|ne|nw|se|sw|n|s|e|w|wt|main|side|no|station|stop"
r"|platform|\d+)"
)
_ENTRANCE_SUFFIX_RE = (
rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*"
rf"\s+{_ENTRANCE_WORDS_RE}"
rf"(?:\s+(?:{_ENTRANCE_FILLER_RE}|{_ENTRANCE_WORDS_RE}))*$"
)
# Bus-station bay/stand designators ("Stand A3", "Bay 2", "Stance 5") are
# stripped so every bay of one station shares a canonical name. The designator
# word must be followed by a short alphanumeric token, so place names ending in
# a bare "Bay" (Colwyn Bay, Herne Bay) are untouched.
_BAY_WORDS = {"stand", "stance", "bay", "gate"}
_BAY_SUFFIX_RE = r"\s+(?:stand|stance|bay|gate)\s+[a-z0-9]{1,3}$"
def _strip_entrance_suffix(words: list[str]) -> list[str]:
"""Drop a trailing entrance designator (direction/number filler around an
entrance word) from a tokenized stop name; no-op when no entrance word."""
idx = len(words)
saw_entrance = False
while idx > 0:
word = words[idx - 1]
if word in _ENTRANCE_NAME_WORDS:
saw_entrance = True
elif word.isdigit() or word in _ENTRANCE_FILLER_WORDS:
pass
else:
break
idx -= 1
return words[:idx] if saw_entrance else words
def canonical_station_name(name: str | None) -> str:
"""Normalize station names so entrances/transport-mode variants collapse."""
@ -55,18 +150,24 @@ def canonical_station_name(name: str | None) -> str:
normalized = re.sub(r"['`]", "", normalized)
normalized = normalized.replace("&", " and ")
normalized = re.sub(r"[^a-z0-9]+", " ", normalized)
words = normalized.split()
words = _strip_entrance_suffix(normalized.split())
if len(words) >= 3 and words[-2] in _BAY_WORDS and len(words[-1]) <= 3:
del words[-2:]
suffixes = (
("underground", "station"),
("tube", "station"),
("dlr", "station"),
("metro", "station"),
("metrolink", "station"),
("metrolink", "stop"),
("tram", "stop"),
("rail", "station"),
("railway", "station"),
("station",),
("stop",),
("metrolink",),
)
while True:
suffix = next(
@ -88,11 +189,14 @@ def canonical_station_name_expr(name_col: str = "name") -> pl.Expr:
expr = expr.str.replace_all(r"&", " and ")
expr = expr.str.replace_all(r"[^a-z0-9]+", " ")
expr = expr.str.replace_all(r"\s+", " ").str.strip_chars()
expr = expr.str.replace_all(_ENTRANCE_SUFFIX_RE, "")
expr = expr.str.replace_all(_BAY_SUFFIX_RE, "")
expr = expr.str.replace_all(
r"\s+(underground|tube|dlr|metro|rail|railway)\s+station$", ""
r"\s+(underground|tube|dlr|metro|metrolink|rail|railway)\s+station$", ""
)
expr = expr.str.replace_all(r"\s+tram\s+stop$", "")
expr = expr.str.replace_all(r"\s+(metrolink|tram)\s+stop$", "")
expr = expr.str.replace_all(r"\s+(station|stop)$", "")
expr = expr.str.replace_all(r"\s+metrolink$", "")
return expr.str.strip_chars()
@ -140,6 +244,7 @@ class StationAccumulator:
lat_sum: float
lng_sum: float
entrance: bool = False
is_lu: bool = False
count: int = 1
@property
@ -159,6 +264,7 @@ class StationAccumulator:
self.lat_sum += float(row["lat"])
self.lng_sum += float(row["lng"])
self.count += 1
self.is_lu = self.is_lu or bool(row.get("is_lu"))
name = str(row["name"] or "")
entrance = bool(row.get("entrance"))
@ -169,6 +275,16 @@ class StationAccumulator:
self.name = name
self.entrance = entrance
@property
def output_category(self) -> str:
    """Category to emit for this merged station.

    A tram/metro station counts as a genuine Tube station when ANY node
    merged into it carried a London Underground ATCO code. The whole group
    is checked (not just the winning node) because LU entrance nodes often
    carry non-ZZLU codes (e.g. 4900VICT...).
    """
    reclassify_as_tube = self.category == TRAM_METRO_CATEGORY and self.is_lu
    return TUBE_STATION_CATEGORY if reclassify_as_tube else self.category
def _station_from_row(row: dict[str, object]) -> StationAccumulator:
return StationAccumulator(
@ -178,6 +294,7 @@ def _station_from_row(row: dict[str, object]) -> StationAccumulator:
lat_sum=float(row["lat"]),
lng_sum=float(row["lng"]),
entrance=bool(row.get("entrance")),
is_lu=bool(row.get("is_lu")),
)
@ -217,7 +334,7 @@ def _deduplicate_station_areas(df: pl.DataFrame) -> pl.DataFrame:
{
"id": [station.id for station in selected],
"name": [station.name for station in selected],
"category": [station.category for station in selected],
"category": [station.output_category for station in selected],
"lat": [station.lat for station in selected],
"lng": [station.lng for station in selected],
}
@ -258,10 +375,12 @@ def _deduplicate_local_stops(df: pl.DataFrame) -> pl.DataFrame:
def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
"""Deduplicate NaPTAN stops, merging station/terminal entrances by area.
Tube, rail and ferry POIs are merged to one record per station by
normalized name + area, with the primary station/terminal node (e.g. RLY,
FER) winning over an entrance node (RSE, FTD). Other stops are deduplicated
by exact name+category+locality.
Tram/metro, rail, ferry and bus-station POIs are merged to one record per
station by normalized name + area, with the primary station/terminal node
(e.g. RLY, FER, MET, BST) winning over an entrance node (RSE, FTD, TMU,
BCE). Merged tram/metro stations with a London Underground ATCO code in
the group become "Tube station". Other stops are deduplicated by exact
name+category+locality.
"""
station = df.filter(pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
other = df.filter(~pl.col("category").is_in(list(STATION_MERGE_CATEGORIES)))
@ -274,6 +393,29 @@ def deduplicate_naptan(df: pl.DataFrame) -> pl.DataFrame:
).select(OUTPUT_COLUMNS)
def filter_active_stops(df: pl.DataFrame) -> pl.DataFrame:
    """Return only the stops whose Status is active (or unknown).

    A stop survives when its ``Status`` is null, or equals "active" / "act"
    after trimming and lowercasing; everything else (inactive, pending, ...)
    is dropped so closed stations do not ship as live POIs. A frame with no
    ``Status`` column is returned untouched, with a warning, so older
    extracts still load.
    """
    if "Status" not in df.columns:
        print("WARNING: NaPTAN data has no Status column; keeping all stops")
        return df

    status = pl.col("Status")
    is_active = status.str.strip_chars().str.to_lowercase().is_in(["active", "act"])
    kept = df.filter(status.is_null() | is_active)

    dropped = len(df) - len(kept)
    if dropped:
        print(f"Dropped {dropped:,} non-active stops (Status != active)")
    return kept
def download_naptan(output: Path) -> None:
output.parent.mkdir(parents=True, exist_ok=True)
@ -291,7 +433,8 @@ def download_naptan(output: Path) -> None:
)
.drop_nulls(subset=["Latitude", "Longitude"])
.filter(pl.col("StopType").is_in(list(STOP_TYPES.keys())))
.select(
)
df = filter_active_stops(df).select(
pl.col("ATCOCode").alias("id"),
pl.col("CommonName").alias("name"),
pl.col("StopType").replace(STOP_TYPES).alias("category"),
@ -299,7 +442,10 @@ def download_naptan(output: Path) -> None:
pl.col("Longitude").alias("lng"),
pl.col("NptgLocalityCode").alias("locality"),
pl.col("StopType").is_in(list(ENTRANCE_STOP_TYPES)).alias("entrance"),
)
pl.col("ATCOCode")
.str.contains(LONDON_UNDERGROUND_ATCO_PATTERN)
.fill_null(False)
.alias("is_lu"),
)
before = len(df)

View file

@ -2,12 +2,15 @@
Downloads the OS Open Greenspace dataset as ESRI Shapefile and extracts
access point locations (park entrances). Each access point is tagged with
its parent site's function type (e.g. Public Park Or Garden). Sites without
access points fall back to polygon centroids.
its parent site's function type (e.g. Public Park Or Garden), the parent
site id and the site's polygon centroid. Sites without access points fall
back to polygon centroids.
Using access points rather than polygon centroids gives much more accurate
distance calculations — a property next to Hyde Park won't show 400m just
because the centroid is in the middle of the park.
because the centroid is in the middle of the park. The site id / centroid
columns let downstream consumers (poi_proximity) collapse the frame back to
one row per SITE for counting, so a park with 30 gates counts as one park.
Source: https://osdatahub.os.uk/downloads/open/OpenGreenspace
License: Open Government Licence v3.0
@ -65,8 +68,8 @@ def _read_site_functions(shp_path: Path) -> dict[str, str]:
def _read_access_points(
shp_path: Path, site_funcs: dict[str, str]
) -> tuple[list[float], list[float], list[str]]:
"""Read access points, tagging each with its parent site's function."""
) -> tuple[list[float], list[float], list[str], list[str]]:
"""Read access points, tagging each with its parent site's function and id."""
reader = shp.Reader(str(shp_path), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
@ -80,6 +83,7 @@ def _read_access_points(
lats: list[float] = []
lngs: list[float] = []
categories: list[str] = []
site_ids: list[str] = []
skipped = 0
error_skipped = 0
@ -107,6 +111,7 @@ def _read_access_points(
lats.append(lat)
lngs.append(lng)
categories.append(func)
site_ids.append(str(site_id))
if skipped:
print(f" Skipped {skipped:,} access points with unknown site ID")
@ -116,31 +121,26 @@ def _read_access_points(
error_skipped,
)
return lats, lngs, categories
return lats, lngs, categories, site_ids
def _read_site_centroids(
shp_path: Path, site_funcs: dict[str, str], covered_ids: set[str]
) -> tuple[list[float], list[float], list[str]]:
"""Read polygon centroids for sites that have no access points (fallback)."""
def _read_site_centroids(shp_path: Path) -> dict[str, tuple[float, float]]:
"""Compute the WGS84 polygon centroid of every greenspace site.
Used both as the representative point for site-level counting and as the
location fallback for sites that have no access points.
"""
reader = shp.Reader(str(shp_path), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
id_idx = _find_field(field_names, "id")
func_idx = _find_field(field_names, "funct")
if id_idx is None or func_idx is None:
return [], [], []
if id_idx is None:
return {}
lats: list[float] = []
lngs: list[float] = []
categories: list[str] = []
centroids: dict[str, tuple[float, float]] = {}
error_skipped = 0
for sr in reader.shapeRecords():
site_id = sr.record[id_idx]
if site_id in covered_ids:
continue
func = sr.record[func_idx]
try:
geom = to_shapely(sr.shape.__geo_interface__)
if geom.is_empty or not geom.is_valid:
@ -156,9 +156,7 @@ def _read_site_centroids(
)
continue
lats.append(lat)
lngs.append(lng)
categories.append(func)
centroids[str(site_id)] = (lat, lng)
if error_skipped:
logger.warning(
@ -166,7 +164,7 @@ def _read_site_centroids(
error_skipped,
)
return lats, lngs, categories
return centroids
def download_greenspace(output: Path) -> None:
@ -194,33 +192,53 @@ def download_greenspace(output: Path) -> None:
# Step 2: Read access points (primary — park entrances)
print(f"Reading {access_shps[0].name}...")
ap_lats, ap_lngs, ap_cats = _read_access_points(access_shps[0], site_funcs)
ap_lats, ap_lngs, ap_cats, ap_site_ids = _read_access_points(
access_shps[0], site_funcs
)
print(f" {len(ap_lats):,} access points loaded")
# Step 3: Fall back to centroids for sites without any access points
covered_ids = set()
reader = shp.Reader(str(access_shps[0]), encoding="latin-1")
field_names = [f[0] for f in reader.fields[1:]]
ref_idx = _find_field(field_names, "refto", "ref_to", "greensp")
if ref_idx is not None:
for rec in reader.iterRecords():
covered_ids.add(rec[ref_idx])
# Step 3: Compute every site's centroid: the representative point for
# site-level counting, and the location fallback for sites without any
# access points.
print("Computing site centroids...")
centroids = _read_site_centroids(site_shps[0])
print(f" {len(centroids):,} site centroids computed")
print("Adding centroids for sites without access points...")
fb_lats, fb_lngs, fb_cats = _read_site_centroids(
site_shps[0], site_funcs, covered_ids
)
covered_ids = set(ap_site_ids)
fb_lats: list[float] = []
fb_lngs: list[float] = []
fb_cats: list[str] = []
fb_site_ids: list[str] = []
for site_id, (lat, lng) in centroids.items():
if site_id in covered_ids:
continue
func = site_funcs.get(site_id)
if func is None:
continue
fb_lats.append(lat)
fb_lngs.append(lng)
fb_cats.append(func)
fb_site_ids.append(site_id)
print(f" {len(fb_lats):,} centroid fallbacks added")
lats = ap_lats + fb_lats
lngs = ap_lngs + fb_lngs
categories = ap_cats + fb_cats
site_ids = ap_site_ids + fb_site_ids
site_lats = [centroids.get(site_id, (None, None))[0] for site_id in site_ids]
site_lngs = [centroids.get(site_id, (None, None))[1] for site_id in site_ids]
df = pl.DataFrame(
{
"lat": np.array(lats, dtype=np.float64),
"lng": np.array(lngs, dtype=np.float64),
"category": categories,
"site_id": site_ids,
# Site polygon centroid (null when the centroid could not be
# computed): the representative point when collapsing to one row
# per site for counting.
"site_lat": pl.Series(site_lats, dtype=pl.Float64),
"site_lng": pl.Series(site_lngs, dtype=pl.Float64),
}
)

View file

@ -641,7 +641,7 @@ def _naptan_dlr_stations(naptan_path: Path) -> list[dict]:
match = _DLR_CODE_RE.search(atco_id)
if not match:
continue
if row["category"] not in {"Tube station", "Rail station"}:
if row["category"] not in {"Tube station", "Tram & Metro stop", "Rail station"}:
continue
code = match.group(1)

View file

@ -2,9 +2,12 @@ import polars as pl
import pytest
from pipeline.download.naptan import (
TRAM_METRO_CATEGORY,
TUBE_STATION_CATEGORY,
canonical_station_name,
canonical_station_name_expr,
deduplicate_naptan,
filter_active_stops,
)
@ -34,6 +37,127 @@ def test_canonical_station_name_expr_normalizes_transport_suffixes():
assert [canonical_station_name(name) for name in names] == result
def test_canonical_station_name_strips_entrance_suffixes():
    """Entrance and bay designators are stripped identically by the Python
    and the polars-expression canonicalisers."""
    # Real shipped NaPTAN entrance names that previously failed to merge with
    # their station node (79 stray entrance POIs).
    cases = {
        "Weaste Metrolink Station North East Entrance": "weaste",
        "Weaste Metrolink Station North Entrance No 2": "weaste",
        "Whitefield Metrolink Station Main Entrance": "whitefield",
        "Radcliffe Metrolink Station Entrance": "radcliffe",
        "Stretford Metrolink Station Wt Platform Entrance": "stretford",
        "Salford Quays Metrolink Station SW entrance": "salford quays",
        "Bank Station Ent 2": "bank",
        "Hainault": "hainault",
        # The Metrolink MET node names collapse to the same key.
        "Weaste (Manchester Metrolink)": "weaste",
        # No entrance word: direction/filler words must NOT be stripped.
        "Maze Hill North": "maze hill north",
        "Bus Station Entrance": "bus",
        # Bus-station bay/stand designators collapse to the station name…
        "Tonypandy Bus Station Stand A3": "tonypandy bus",
        "Caerphilly Interchange Stand 5": "caerphilly interchange",
        "Stanley Bus Station Stand G": "stanley bus",
        # …but a bare trailing "Bay" (place names) is untouched.
        "Colwyn Bay": "colwyn bay",
    }
    for name, expected in cases.items():
        assert canonical_station_name(name) == expected, name

    # The expression version must produce the same keys, in the same order.
    df = pl.DataFrame({"name": list(cases.keys())})
    expr_result = df.select(canonical_station_name_expr().alias("key"))["key"].to_list()
    assert expr_result == list(cases.values())
def test_filter_active_stops_drops_non_active():
    """Only active and null-status stops survive the Status filter."""
    df = pl.DataFrame(
        {
            "ATCOCode": ["a", "b", "c", "d"],
            "Status": ["active", "inactive", None, "Pending"],
        }
    )
    result = filter_active_stops(df)
    # Active and unknown (null) statuses survive; inactive/pending are dropped.
    assert result["ATCOCode"].to_list() == ["a", "c"]
def test_filter_active_stops_tolerates_missing_status_column():
    """A frame without a Status column passes through unchanged."""
    df = pl.DataFrame({"ATCOCode": ["a"]})
    assert filter_active_stops(df)["ATCOCode"].to_list() == ["a"]
def test_deduplicate_naptan_splits_london_underground_from_tram_metro():
    """A merged group with an LU-flagged node becomes a Tube station; a
    group without one stays a tram/metro stop."""
    # MET station nodes plus TMU entrances, pre-categorised as the tram/metro
    # family. The Hainault group contains a 940GZZLU station node, so the
    # merged POI is a genuine "Tube station" even though its entrance carries a
    # non-ZZLU ATCO code; the Metrolink group stays "Tram & Metro stop".
    df = pl.DataFrame(
        {
            "id": [
                "940GZZLUHLT",
                "490000095003",
                "9400ZZMAWST",
                "1800NFR2691",
            ],
            "name": [
                "Hainault Underground Station",
                "Hainault",
                "Weaste (Manchester Metrolink)",
                "Weaste Metrolink Station North West Entrance",
            ],
            "category": [TRAM_METRO_CATEGORY] * 4,
            "lat": [51.6034, 51.6037, 53.4826, 53.4826],
            "lng": [0.0933, 0.0931, -2.3087, -2.3086],
            "locality": [None, None, None, None],
            "entrance": [False, True, False, True],
            "is_lu": [True, False, False, False],
        }
    )

    result = deduplicate_naptan(df).sort("category")

    assert len(result) == 2
    assert result["category"].to_list() == [
        TRAM_METRO_CATEGORY,
        TUBE_STATION_CATEGORY,
    ]
    tube = result.filter(pl.col("category") == TUBE_STATION_CATEGORY)
    # The station node (not the entrance) represents the merged POI.
    assert tube["id"][0] == "940GZZLUHLT"
    tram = result.filter(pl.col("category") == TRAM_METRO_CATEGORY)
    assert tram["id"][0] == "9400ZZMAWST"
def test_deduplicate_naptan_merges_bus_station_bays_and_entrances():
    """Bays and entrances of one bus station merge; a distant station stays.

    Two BCS bays and one BCE entrance of Bury Interchange must collapse to a
    single POI represented by a non-entrance node, while Rochdale
    Interchange, in another area, survives as its own POI.
    """
    # (id, name, lat, lng, locality, entrance)
    nodes = [
        ("bay-1", "Bury Interchange", 53.5907, -2.2958, "BURY", False),
        ("bay-2", "Bury Interchange", 53.5908, -2.2957, "BURY", False),
        ("ent-1", "Bury Interchange East Entrance", 53.5909, -2.2956, "BURY", True),
        ("other", "Rochdale Interchange", 53.6160, -2.1561, "ROCHDALE", False),
    ]
    ids, names, lats, lngs, localities, entrances = (
        list(col) for col in zip(*nodes)
    )
    df = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": ["Bus station"] * 4,
            "lat": lats,
            "lng": lngs,
            "locality": localities,
            "entrance": entrances,
        }
    )

    merged = deduplicate_naptan(df).sort("name")

    assert merged["name"].to_list() == ["Bury Interchange", "Rochdale Interchange"]
    bury = merged.filter(pl.col("name") == "Bury Interchange")
    assert bury["id"][0] == "bay-1"
def test_deduplicate_naptan_merges_tube_station_variants_by_area():
df = pl.DataFrame(
{

View file

@ -86,7 +86,7 @@ def test_naptan_dlr_stations_are_deduplicated_by_atco_code(tmp_path):
"Bank",
],
"category": [
"Tube station",
"Tram & Metro stop",
"Tube station",
"Rail station",
"Bus stop",

View file

@ -1,11 +1,15 @@
"""Tests for transit_network GTFS processing."""
import datetime as dt
import zipfile
from pathlib import Path
import pytest
from pipeline.download.transit_network import convert_high_freq_to_frequency_based
from pipeline.download.transit_network import (
convert_high_freq_to_frequency_based,
validate_gtfs_feed,
)
def _write_gtfs(path: Path, *, stop_times: str) -> None:
@ -77,3 +81,162 @@ def test_raises_when_no_first_stops_found(tmp_path: Path) -> None:
with pytest.raises(RuntimeError, match="no first stops"):
convert_high_freq_to_frequency_based(src, dst)
# ── validate_gtfs_feed ────────────────────────────────────────────────────────
TODAY = dt.date(2026, 6, 10)
def _make_gtfs(
path: Path,
*,
calendar: str | None = (
"service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
"start_date,end_date\n"
"S1,1,1,1,1,1,0,0,20260101,20271231\n"
),
calendar_dates: str | None = None,
stops: str = (
"stop_id,stop_name,stop_lat,stop_lon\n"
"STOP_A,Bank,51.5133,-0.0886\n"
"STOP_B,Liverpool Street,51.5178,-0.0823\n"
),
routes: str = "route_id,agency_id,route_short_name,route_type\nR1,OP1,Central,1\n",
trips: str = "trip_id,route_id,service_id\nT1,R1,S1\n",
stop_times: str = (
"trip_id,stop_sequence,departure_time,stop_id\n"
"T1,0,06:00:00,STOP_A\n"
"T1,1,06:02:00,STOP_B\n"
),
) -> Path:
"""Write a tiny synthetic GTFS zip; defaults form a valid current feed."""
with zipfile.ZipFile(path, "w") as z:
if calendar is not None:
z.writestr("calendar.txt", calendar)
if calendar_dates is not None:
z.writestr("calendar_dates.txt", calendar_dates)
z.writestr("stops.txt", stops)
z.writestr("routes.txt", routes)
z.writestr("trips.txt", trips)
z.writestr("stop_times.txt", stop_times)
return path
def test_validate_gtfs_feed_happy_path(tmp_path: Path) -> None:
    """The default synthetic feed must validate without raising."""
    feed_path = tmp_path / "feed.zip"
    _make_gtfs(feed_path)
    # Success is simply the absence of an exception.
    validate_gtfs_feed(feed_path, "test feed", today=TODAY)
def test_validate_gtfs_feed_expired_calendar(tmp_path: Path) -> None:
    """A feed whose only calendar ended in 2010 is rejected as having no service.

    This reproduces the 2010 TfL snapshot failure mode, and also checks
    that the error message names the feed.
    """
    expired_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20091201,20101224\n"
    )
    feed = _make_gtfs(tmp_path / "feed.zip", calendar=expired_calendar)

    with pytest.raises(RuntimeError, match=r"'stale tfl'.*no service active"):
        validate_gtfs_feed(feed, "stale tfl", today=TODAY)
def test_validate_gtfs_feed_calendar_starting_after_window_fails(
    tmp_path: Path,
) -> None:
    """A calendar that only begins in 2027 provides no service near TODAY."""
    future_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20270101,20271231\n"
    )
    feed = _make_gtfs(tmp_path / "feed.zip", calendar=future_calendar)

    with pytest.raises(RuntimeError, match="no service active"):
        validate_gtfs_feed(feed, "future feed", today=TODAY)
def test_validate_gtfs_feed_calendar_dates_rescues_expired_calendar(
    tmp_path: Path,
) -> None:
    """An added-service date shortly after TODAY makes an expired feed valid."""
    expired_calendar = (
        "service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,"
        "start_date,end_date\n"
        "S1,1,1,1,1,1,0,0,20091201,20101224\n"
    )
    # exception_type=1 adds service on 2026-06-15, five days after TODAY.
    added_service = "service_id,date,exception_type\nS1,20260615,1\n"
    feed = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=expired_calendar,
        calendar_dates=added_service,
    )

    # Success is simply the absence of an exception.
    validate_gtfs_feed(feed, "rescued feed", today=TODAY)
def test_validate_gtfs_feed_removed_service_exception_does_not_count(
    tmp_path: Path,
) -> None:
    """A calendar_dates row with exception_type=2 must not count as service."""
    removed_service = "service_id,date,exception_type\nS1,20260615,2\n"
    feed = _make_gtfs(
        tmp_path / "feed.zip",
        calendar=None,
        calendar_dates=removed_service,
    )

    with pytest.raises(RuntimeError, match="no service active"):
        validate_gtfs_feed(feed, "removed-only feed", today=TODAY)
def test_validate_gtfs_feed_zero_and_empty_coords(tmp_path: Path) -> None:
    """Stops at 0,0 or with blank coordinates fail the coordinate check.

    This is the second failure mode of the 2010 TfL snapshot.
    """
    broken_stops = (
        "stop_id,stop_name,stop_lat,stop_lon\n"
        "STOP_A,Nowhere,0,0\n"
        "STOP_B,Blank,,\n"
    )
    feed = _make_gtfs(tmp_path / "feed.zip", stops=broken_stops)

    with pytest.raises(RuntimeError, match=r"plausible UK coordinates"):
        validate_gtfs_feed(feed, "coordless feed", today=TODAY)
def test_validate_gtfs_feed_non_uk_coords_fail(tmp_path: Path) -> None:
    """Well-formed coordinates outside the UK are rejected."""
    overseas_stops = (
        "stop_id,stop_name,stop_lat,stop_lon\n"
        "STOP_A,New York,40.71,-74.0\n"
        "STOP_B,Sydney,-33.87,151.21\n"
    )
    feed = _make_gtfs(tmp_path / "feed.zip", stops=overseas_stops)

    with pytest.raises(RuntimeError, match="plausible UK coordinates"):
        validate_gtfs_feed(feed, "abroad feed", today=TODAY)
def test_validate_gtfs_feed_minority_bad_coords_pass(tmp_path: Path) -> None:
    """One bad stop among 30 (3.3%) is within the 5% tolerance and passes."""
    good_rows = "".join(
        f"STOP_{i},Stop {i},51.5,{-0.1 + i * 0.001}\n" for i in range(29)
    )
    bad_row = "STOP_BAD,Broken,0,0\n"
    stops_text = "stop_id,stop_name,stop_lat,stop_lon\n" + good_rows + bad_row
    feed = _make_gtfs(tmp_path / "feed.zip", stops=stops_text)

    # Success is simply the absence of an exception.
    validate_gtfs_feed(feed, "mostly good feed", today=TODAY)
def test_validate_gtfs_feed_empty_trips(tmp_path: Path) -> None:
    """A trips.txt holding only its header row is reported as empty."""
    header_only = "trip_id,route_id,service_id\n"
    feed = _make_gtfs(tmp_path / "feed.zip", trips=header_only)

    with pytest.raises(RuntimeError, match="trips.txt has no data rows"):
        validate_gtfs_feed(feed, "tripless feed", today=TODAY)
def test_validate_gtfs_feed_missing_calendar_files(tmp_path: Path) -> None:
    """A feed with neither calendar.txt nor calendar_dates.txt is rejected."""
    # calendar_dates already defaults to None, so no calendar file is written.
    feed = _make_gtfs(tmp_path / "feed.zip", calendar=None)

    with pytest.raises(RuntimeError, match="neither calendar.txt nor calendar_dates"):
        validate_gtfs_feed(feed, "calendarless feed", today=TODAY)
def test_validate_gtfs_feed_not_a_zip(tmp_path: Path) -> None:
    """A plain-text file named like a zip is rejected as an invalid archive."""
    fake_zip = tmp_path / "feed.zip"
    fake_zip.write_text("not a zip")

    with pytest.raises(RuntimeError, match="not a valid zip"):
        validate_gtfs_feed(fake_zip, "bogus feed", today=TODAY)

View file

@ -2,24 +2,32 @@
Downloads:
- England OSM PBF from Geofabrik (~1.5GB)
- BODS GTFS from Bus Open Data Service (~1.5GB, all England bus/tram/ferry)
- TfL TransXChange timetables converted to GTFS
- National Rail CIF timetable converted to GTFS (requires credentials)
- BODS GTFS from Bus Open Data Service (~1.5GB; all England bus/tram/ferry,
plus London Underground, DLR, London Tramlink and the IFS Cloud Cable Car)
- National Rail CIF timetable converted to GTFS (requires credentials;
includes the Elizabeth line, TOC "XR")
Then processes for R5 compatibility:
- Cleans BODS GTFS (fixes stop_times >72h, feed_info year >2100)
- Converts high-frequency metro/tram services to frequency-based GTFS
- Converts TfL TransXChange to GTFS via transxchange2gtfs
- Converts National Rail CIF to GTFS via dtd2mysql (requires MariaDB Docker)
- Validates every produced GTFS zip (active calendar window, plausible UK
stop coordinates, non-empty routes/trips/stop_times)
Requires: osmium-tool, Node.js (npx), Docker (for national rail)
Note: the legacy TfL TransXChange feed (tfl.gov.uk journey-planner-timetables)
was removed: that URL serves a 2010-10-28 snapshot whose calendars all expired
in 2010 and whose stops have empty/0,0 coordinates, so it contributed zero
service. BODS covers all TfL modes that feed nominally provided.
Requires: osmium-tool, Docker (for national rail)
Output directory: property-data/transit/
raw/england.osm.pbf + bods_gtfs.zip + tfl_gtfs.zip + national_rail_gtfs.zip
raw/england.osm.pbf + bods_gtfs.zip + national_rail_gtfs.zip
"""
import argparse
import csv
import datetime as dt
import io
import json
import os
@ -45,20 +53,18 @@ ENGLAND_PBF_URL = (
# Bus Open Data Service — pre-converted GTFS covering all England bus/tram/ferry
BODS_GTFS_URL = "https://data.bus-data.dft.gov.uk/timetable/download/gtfs-file/all/"
# TfL TransXChange timetables (tube, DLR, tram, buses, river bus, cable car)
TFL_TRANSXCHANGE_URL = (
"https://tfl.gov.uk/cdn/static/cms/documents/journey-planner-timetables.zip"
)
# NaPTAN stops data — needed by transxchange2gtfs (its built-in URL is broken)
NAPTAN_URL = "https://naptan.api.dft.gov.uk/v1/access-nodes?dataFormat=csv"
# National Rail Open Data API
NR_AUTH_URL = "https://opendata.nationalrail.co.uk/authenticate"
NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/timetable"
USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"
TRANSXCHANGE2GTFS_PACKAGE = "transxchange2gtfs@1.12.0"
# GTFS validation: a feed must have service within this many days of the build
# date, and at least this fraction of stops must have plausible UK coordinates.
GTFS_CALENDAR_LOOKAHEAD_DAYS = 60
GTFS_MIN_VALID_STOP_FRACTION = 0.95
UK_LAT_RANGE = (49.0, 61.0)
UK_LON_RANGE = (-9.0, 2.5)
def _download_http(
@ -468,89 +474,175 @@ def convert_high_freq_to_frequency_based(
print(f" Saved to {dst}")
def download_tfl_transxchange(raw_dir: Path) -> Path:
"""Download TfL TransXChange timetable bundle."""
dest = raw_dir / "tfl_transxchange.zip"
if dest.exists():
print(f"TfL TransXChange already exists: {dest}")
return dest
print("Downloading TfL TransXChange timetables...")
_download_http(TFL_TRANSXCHANGE_URL, dest, desc="tfl_transxchange.zip")
return dest
def _gtfs_has_data_row(z: zipfile.ZipFile, filename: str) -> bool:
    """Report whether ``filename`` inside ``z`` has any data after its header.

    The first line is discarded as the header. Each remaining line is passed
    to ``_parse_csv_line``, and the first one that yields a truthy result
    makes the answer ``True``.
    """
    with z.open(filename) as handle:
        handle.readline()  # discard the header row
        # any() stops at the first parsable line, so large files are not
        # read to the end.
        return any(_parse_csv_line(raw_line) for raw_line in handle)
def download_naptan() -> None:
"""Download NaPTAN stops to the local temp dir for transxchange2gtfs."""
dest = local_tmp_dir() / "Stops.csv"
if dest.exists():
print(f"NaPTAN Stops.csv already exists: {dest}")
return
def _calendar_active_in_window(
z: zipfile.ZipFile, names: set[str], window_start: int, window_end: int
) -> bool:
"""True if calendar.txt/calendar_dates.txt have service in [start, end].
print("Downloading NaPTAN stops data...")
_download_http(NAPTAN_URL, dest, desc="Stops.csv")
def convert_tfl_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
"""Convert TfL TransXChange to GTFS using transxchange2gtfs."""
dest = output_dir / "tfl_gtfs.zip"
if dest.exists():
print(f"TfL GTFS already exists: {dest}")
return dest
txc_path = raw_dir / "tfl_transxchange.zip"
# Ensure NaPTAN is available (transxchange2gtfs has a broken download URL)
download_naptan()
print("Converting TfL TransXChange → GTFS...")
# The shim patches known packaging/runtime issues in the pinned npm package
# before loading its CLI from npx's temporary install.
shim_path = Path(__file__).with_name("transxchange2gtfs_shim.js")
subprocess.run(
[
"npx",
"--yes",
"--package",
TRANSXCHANGE2GTFS_PACKAGE,
"sh",
"-c",
"\n".join(
[
'bin="$(command -v transxchange2gtfs)"',
'script="$(readlink -f "$bin")"',
'pkg_dir="$(dirname "$(dirname "$script")")"',
'shim="$1"',
"shift",
'exec node "$shim" "$pkg_dir" "$@"',
]
),
"transxchange2gtfs",
str(shim_path.resolve()),
str(txc_path.resolve()),
str(dest.resolve()),
],
check=True,
Dates are compared as YYYYMMDD integers. A calendar.txt row counts when its
date range overlaps the window AND at least one weekday flag is set; a
calendar_dates.txt row counts when it adds service (exception_type=1) on a
date inside the window.
"""
weekdays = (
"monday",
"tuesday",
"wednesday",
"thursday",
"friday",
"saturday",
"sunday",
)
if "calendar.txt" in names:
with z.open("calendar.txt") as f:
cols = _parse_csv_line(f.readline())
try:
start_idx = cols.index("start_date")
end_idx = cols.index("end_date")
except ValueError:
return False
day_idxs = [cols.index(d) for d in weekdays if d in cols]
for line in f:
parts = _parse_csv_line(line)
if not parts:
continue
try:
start = int(parts[start_idx].strip('"'))
end = int(parts[end_idx].strip('"'))
except (ValueError, IndexError):
continue
if start > window_end or end < window_start:
continue
if day_idxs and not any(
parts[i].strip('"') == "1" for i in day_idxs if i < len(parts)
):
continue
return True
if "calendar_dates.txt" in names:
with z.open("calendar_dates.txt") as f:
cols = _parse_csv_line(f.readline())
try:
date_idx = cols.index("date")
exc_idx = cols.index("exception_type")
except ValueError:
return False
for line in f:
parts = _parse_csv_line(line)
if not parts:
continue
try:
date = int(parts[date_idx].strip('"'))
except (ValueError, IndexError):
continue
if exc_idx < len(parts) and parts[exc_idx].strip('"') != "1":
continue
if window_start <= date <= window_end:
return True
return False
def validate_gtfs_feed(path: Path, feed_name: str, *, today: dt.date | None = None) -> None:
    """Sanity-check a produced/downloaded GTFS zip; raise RuntimeError if dead.

    Guards against silently shipping a feed that contributes zero service (as
    the old TfL dump did: 2010 calendars, empty/0,0 stop coordinates). Checks,
    in the order they are performed:

      1. ``path`` exists and is a zip archive;
      2. routes.txt, trips.txt, stop_times.txt and stops.txt are all present
         and each has at least one data row;
      3. calendar.txt/calendar_dates.txt exist (at least one of them) and have
         service active within [today, today + GTFS_CALENDAR_LOOKAHEAD_DAYS];
      4. at least GTFS_MIN_VALID_STOP_FRACTION of the stops have coordinates
         inside UK_LAT_RANGE / UK_LON_RANGE and are not exactly 0,0.

    Args:
        path: Location of the GTFS zip to check.
        feed_name: Human-readable label used in log and error messages.
        today: Build date that anchors the calendar window; defaults to
            ``dt.date.today()``. Tests pass a fixed date.

    Raises:
        RuntimeError: On the first failed check. The message names the feed,
            the path and the reason.
    """
    if today is None:
        today = dt.date.today()
    # Dates are compared as YYYYMMDD integers, the same form GTFS uses.
    window_start = int(today.strftime("%Y%m%d"))
    window_end = int(
        (today + dt.timedelta(days=GTFS_CALENDAR_LOOKAHEAD_DAYS)).strftime("%Y%m%d")
    )

    def fail(reason: str) -> None:
        # Always raises; callers rely on this to stop processing.
        raise RuntimeError(
            f"GTFS validation failed for feed '{feed_name}' ({path}): {reason}"
        )

    print(f"Validating GTFS feed '{feed_name}'...")
    if not path.exists() or not zipfile.is_zipfile(path):
        fail("not a valid zip file")

    with zipfile.ZipFile(path) as z:
        names = set(z.namelist())

        # Core files present and non-empty.
        for required in ("routes.txt", "trips.txt", "stop_times.txt", "stops.txt"):
            if required not in names:
                fail(f"missing {required}")
            if not _gtfs_has_data_row(z, required):
                fail(f"{required} has no data rows")

        # At least one service active in the routing window.
        if "calendar.txt" not in names and "calendar_dates.txt" not in names:
            fail("has neither calendar.txt nor calendar_dates.txt")
        if not _calendar_active_in_window(z, names, window_start, window_end):
            # The "; " separator matters: without it the two literals fuse
            # into "...and 20260809the feed's calendars...".
            fail(
                f"no service active between {window_start} and {window_end}; "
                "the feed's calendars are stale/expired and it would contribute "
                "zero service to routing"
            )

        # Stops have plausible UK coordinates.
        total_stops = 0
        valid_stops = 0
        with z.open("stops.txt") as f:
            cols = _parse_csv_line(f.readline())
            try:
                lat_idx = cols.index("stop_lat")
                lon_idx = cols.index("stop_lon")
            except ValueError:
                fail("stops.txt is missing stop_lat/stop_lon columns")
            for line in f:
                parts = _parse_csv_line(line)
                if not parts:
                    continue
                # Every non-empty row counts towards the total, so rows with
                # unusable coordinates lower the valid fraction.
                total_stops += 1
                try:
                    lat = float(parts[lat_idx].strip('"'))
                    lon = float(parts[lon_idx].strip('"'))
                except (ValueError, IndexError):
                    continue  # empty/garbage coordinate → invalid
                if lat == 0.0 and lon == 0.0:
                    continue
                if (
                    UK_LAT_RANGE[0] <= lat <= UK_LAT_RANGE[1]
                    and UK_LON_RANGE[0] <= lon <= UK_LON_RANGE[1]
                ):
                    valid_stops += 1

        if total_stops == 0:
            fail("stops.txt has no stops")
        fraction = valid_stops / total_stops
        if fraction < GTFS_MIN_VALID_STOP_FRACTION:
            fail(
                f"only {valid_stops}/{total_stops} stops "
                f"({fraction:.1%}) have plausible UK coordinates "
                f"(lat {UK_LAT_RANGE[0]}-{UK_LAT_RANGE[1]}, "
                f"lon {UK_LON_RANGE[0]}..{UK_LON_RANGE[1]}, non-null, not 0,0); "
                f"need >= {GTFS_MIN_VALID_STOP_FRACTION:.0%}"
            )

    print(
        f"  OK: service active in window, {valid_stops}/{total_stops} stops "
        f"({fraction:.1%}) with plausible UK coordinates"
    )
required_files = {
"agency.txt",
"calendar.txt",
"calendar_dates.txt",
"routes.txt",
"stop_times.txt",
"stops.txt",
"trips.txt",
}
if not dest.exists() or not zipfile.is_zipfile(dest):
raise RuntimeError(f"transxchange2gtfs did not create a valid GTFS zip: {dest}")
with zipfile.ZipFile(dest) as z:
missing = required_files - set(z.namelist())
if missing:
missing_str = ", ".join(sorted(missing))
raise RuntimeError(f"TfL GTFS zip is missing required files: {missing_str}")
size_mb = dest.stat().st_size / (1024 * 1024)
print(f" Saved to {dest} ({size_mb:.1f} MB)")
return dest
def download_national_rail_cif(raw_dir: Path) -> Path | None:
@ -1007,18 +1099,15 @@ def main() -> None:
required=True,
help="Output directory for transit data",
)
parser.add_argument(
"--skip-tfl",
action="store_true",
help="Skip TfL TransXChange download and conversion",
)
args = parser.parse_args()
output_dir: Path = args.output
raw_dir = output_dir / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)
# 1. Download, clean, and frequency-convert BODS GTFS
# 1. Download, clean, and frequency-convert BODS GTFS. BODS covers all
# England bus/tram/ferry plus London Underground, DLR, London Tramlink and
# the IFS Cloud Cable Car, so no separate TfL feed is needed.
download_osm_pbf(raw_dir)
bods_raw = download_bods_gtfs(raw_dir)
@ -1027,16 +1116,10 @@ def main() -> None:
bods_final = output_dir / "bods_gtfs.zip"
convert_high_freq_to_frequency_based(bods_cleaned, bods_final)
validate_gtfs_feed(bods_final, "BODS GTFS")
# 2. TfL TransXChange → GTFS
if args.skip_tfl:
print("Skipping TfL (--skip-tfl)")
else:
download_tfl_transxchange(raw_dir)
convert_tfl_to_gtfs(raw_dir, output_dir)
# 3. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
# reach the ~2,725 railway-station destinations, so a bus/TfL-only network
# 2. National Rail CIF → GTFS. Heavy rail is mandatory: trains are how people
# reach the ~2,725 railway-station destinations, so a bus/metro-only network
# silently overstates every train commute. Missing credentials are a HARD
# error, so a rail-less network can never ship.
cif = download_national_rail_cif(raw_dir)
@ -1048,7 +1131,8 @@ def main() -> None:
"required; without it the transit network models every train journey "
"as bus-only and overstates commute times."
)
convert_national_rail_to_gtfs(raw_dir, output_dir)
nr_final = convert_national_rail_to_gtfs(raw_dir, output_dir)
validate_gtfs_feed(nr_final, "National Rail GTFS")
# Summary
print()

View file

@ -1,106 +0,0 @@
#!/usr/bin/env node
"use strict";
const fs = require("fs");
const path = require("path");
const { createRequire } = require("module");
// Command line: <package-dir> followed by the converter's own arguments.
// At least two converter arguments (input and output) are required.
const [pkgDirArg, ...converterArgs] = process.argv.slice(2);
if (!pkgDirArg || converterArgs.length < 2) {
console.error(
"Usage: transxchange2gtfs_shim.js <package-dir> <input...> <output>",
);
process.exit(2);
}
const pkgDir = path.resolve(pkgDirArg);
// Fallback temp dir: ".tmp" two directories above this script.
const defaultTmpDir = path.resolve(__dirname, "..", "..", ".tmp");
// The first non-empty of TMPDIR, TEMP, TMP wins; otherwise the fallback.
const localTmpDir =
process.env.TMPDIR || process.env.TEMP || process.env.TMP || defaultTmpDir;
// Paths substituted for the package's hard-coded /tmp locations.
const stopsCsv = path.join(localTmpDir, "Stops.csv");
const converterTmpPrefix = path.join(localTmpDir, "transxchange2gtfs_");
// Source text that replaces the package's `static TMP = ...` line; it yields
// <prefix><pid><path separator> when the patched code runs.
const converterTmpPatch =
`static TMP = ${JSON.stringify(converterTmpPrefix)}` +
` + process.pid + ${JSON.stringify(path.sep)};`;
fs.mkdirSync(localTmpDir, { recursive: true });
/**
 * Replace the first occurrence of `before` with `after` in a package file.
 *
 * The call is idempotent: if `before` is absent but `after` is already
 * present, the file is treated as patched and left untouched.
 *
 * @param {string} relativePath File path relative to `pkgDir`.
 * @param {string} before Exact text expected in the unpatched file.
 * @param {string} after Exact text to put in its place.
 * @throws {Error} When the file contains neither `before` nor `after`.
 */
function replaceOnce(relativePath, before, after) {
  const file = path.join(pkgDir, relativePath);
  const original = fs.readFileSync(file, "utf8");
  if (original.includes(before)) {
    // A replacer function inserts `after` verbatim. A plain string would have
    // "$&", "$1", "$$" etc. expanded, which corrupts replacement text built
    // from file-system paths that contain "$".
    fs.writeFileSync(file, original.replace(before, () => after));
    return;
  }
  if (original.includes(after)) {
    return; // already patched on an earlier run
  }
  throw new Error(`Could not patch ${relativePath}: expected text not found`);
}
// The published 1.12.0 package has a few compatibility issues with current
// TfL TransXChange exports:
// - the bin script points at dist/src/cli.js, but the package ships dist/cli.js
// - the compiled date-holidays import expects a synthetic default export
// - some TfL journeys reference timing links without matching route-link geometry
//
// GTFS shapes are optional for R5 routing. Clear shape references and omit
// shapes.txt so missing route geometry does not drop otherwise usable trips.
// Apply every source patch to the installed package, in order. Each step uses
// replaceOnce, so a step throws if its expected text is missing from the file.
function patchPackage() {
// Move the converter's scratch directory from /tmp to the local temp dir.
replaceOnce(
"dist/Container.js",
"static TMP = `/tmp/transxchange2gtfs_${process.pid}/`;",
converterTmpPatch,
);
// Point the three hard-coded /tmp/Stops.csv references (exists check, read
// stream, write stream) at the Stops.csv in the local temp dir.
replaceOnce(
"dist/Container.js",
'fs.existsSync("/tmp/Stops.csv")',
`fs.existsSync(${JSON.stringify(stopsCsv)})`,
);
replaceOnce(
"dist/Container.js",
'fs.createReadStream("/tmp/Stops.csv", "utf8")',
`fs.createReadStream(${JSON.stringify(stopsCsv)}, "utf8")`,
);
replaceOnce(
"dist/converter/GetStopData.js",
'fs.createWriteStream("/tmp/Stops.csv")',
`fs.createWriteStream(${JSON.stringify(stopsCsv)})`,
);
// Add zero distance when a journey has no matching route link.
replaceOnce(
"dist/transxchange/TransXChangeJourneyStream.js",
"distanceSoFarM += routeLink.Distance;",
"distanceSoFarM += routeLink ? routeLink.Distance : 0;",
);
// Emit an empty string in place of the MD5-derived value in trips output.
replaceOnce(
"dist/gtfs/TripsStream.js",
"(0, crypto_1.createHash)('md5').update(JSON.stringify({ routeId: journey.route, routeLinkSeq: journey.routeLinkIds })).digest(\"hex\"));",
"\"\");",
);
// Emit an empty string in place of shapeDistTraveled in stop_times output.
replaceOnce(
"dist/gtfs/StopTimesStream.js",
"stop.shapeDistTraveled, stop.exactTime ? \"1\" : \"0\");",
"\"\", stop.exactTime ? \"1\" : \"0\");",
);
// Drop the shapes.txt output stream.
replaceOnce(
"dist/Container.js",
"\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex)),\n \"shapes.txt\": journeyStream.pipe(new ShapesStream_1.ShapesStream())",
"\"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
);
// Drop the transfers.txt output stream.
replaceOnce(
"dist/Container.js",
"\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"transfers.txt\": transxchange.pipe(new TransfersStream_1.TransfersStream(naptanIndex, locationIndex)),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
"\"routes.txt\": transxchange.pipe(new RoutesStream_1.RoutesStream()),\n \"stops.txt\": transxchange.pipe(new StopsStream_1.StopsStream(naptanIndex))",
);
}
// Patch the package files on disk before any of its modules are loaded.
patchPackage();
// Resolve modules relative to the installed package, not this shim.
const pkgRequire = createRequire(path.join(pkgDir, "package.json"));
const Holidays = pkgRequire("date-holidays");
// Give the module a `default` property pointing at itself when it has none.
if (!Holidays.default) {
Holidays.default = Holidays;
}
// Rewrite argv so the CLI sees only the converter arguments, then load the
// CLI module from the package.
process.argv = [process.argv[0], "transxchange2gtfs", ...converterArgs];
require(path.join(pkgDir, "dist", "cli.js"));

View file

@ -109,6 +109,27 @@ def _clean_number(column: str, dtype: pl.DataType) -> pl.Expr:
return _clean_string(column).cast(dtype, strict=False)
def _join_address_parts(*columns: str) -> pl.Expr:
    """Build one single-spaced display address from several component columns.

    Price-paid SAON/PAON/STREET are EMPTY STRINGS (not null) when absent
    (saon is "" on ~88% of rows), and ``concat_str(..., ignore_nulls=True)``
    skips only nulls. Joining the raw columns therefore kept a separator for
    each empty component (``' 10 PALACE GREEN'``, or doubled spaces when a
    middle part was empty).

    Each component goes through ``_clean_string`` first, so that ``''``
    becomes null and ``ignore_nulls`` works as intended. The joined text then
    has whitespace runs collapsed and is stripped as a defensive measure. A
    fully-empty address yields null (dropped by the downstream
    ``pp_address.is_not_null()`` filter) instead of whitespace junk.
    """
    parts = [_clean_string(name) for name in columns]
    address = (
        pl.concat_str(parts, separator=" ", ignore_nulls=True)
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
    )
    # An address that is empty after cleaning is reported as null.
    return pl.when(address == "").then(None).otherwise(address)
def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
return (
raw.select(
@ -436,11 +457,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
)
.filter(pl.col("pp_property_type") != "Other")
.with_columns(
pl.concat_str(
[pl.col("saon"), pl.col("paon"), pl.col("street")],
separator=" ",
ignore_nulls=True,
).alias("pp_address"),
_join_address_parts("saon", "paon", "street").alias("pp_address"),
)
.with_columns(
normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),

View file

@ -102,15 +102,11 @@ _AREA_COLUMNS = [
# is postcode-grain: it belongs in the area output (one value per postcode,
# covering property-less postcodes too) rather than duplicated per property.
TREE_DENSITY_FEATURE,
# Schools
"Good+ primary schools within 5km",
"Good+ secondary schools within 5km",
"Good+ primary schools within 2km",
"Good+ secondary schools within 2km",
"Outstanding primary schools within 5km",
"Outstanding secondary schools within 5km",
"Outstanding primary schools within 2km",
"Outstanding secondary schools within 2km",
# Schools (modelled historical catchment areas covering the postcode)
"Good+ primary school catchments",
"Good+ secondary school catchments",
"Outstanding primary school catchments",
"Outstanding secondary school catchments",
# Demographics
"Median age",
# Politics
@ -172,14 +168,10 @@ _FINAL_RENAME_COLUMNS = {
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
"good_primary_5km": "Good+ primary schools within 5km",
"good_secondary_5km": "Good+ secondary schools within 5km",
"good_primary_2km": "Good+ primary schools within 2km",
"good_secondary_2km": "Good+ secondary schools within 2km",
"outstanding_primary_5km": "Outstanding primary schools within 5km",
"outstanding_secondary_5km": "Outstanding secondary schools within 5km",
"outstanding_primary_2km": "Outstanding primary schools within 2km",
"outstanding_secondary_2km": "Outstanding secondary schools within 2km",
"good_primary_catchments": "Good+ primary school catchments",
"good_secondary_catchments": "Good+ secondary school catchments",
"outstanding_primary_catchments": "Outstanding primary school catchments",
"outstanding_secondary_catchments": "Outstanding secondary school catchments",
"max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)",
@ -874,7 +866,7 @@ def _join_area_side_tables(
election: pl.LazyFrame,
poi_counts: pl.LazyFrame,
noise: pl.LazyFrame,
school_proximity: pl.LazyFrame,
school_catchments: pl.LazyFrame,
conservation_areas: pl.LazyFrame,
tree_density: pl.LazyFrame | None,
broadband: pl.LazyFrame,
@ -905,7 +897,7 @@ def _join_area_side_tables(
base = base.join(election, on="pcon", how="left")
base = base.join(poi_counts, on="postcode", how="left")
base = base.join(noise, on="postcode", how="left")
base = base.join(school_proximity, on="postcode", how="left")
base = base.join(school_catchments, on="postcode", how="left")
base = base.join(conservation_areas, on="postcode", how="left").with_columns(
pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
)
@ -1970,7 +1962,7 @@ def _build(
ethnicity_path: Path,
crime_path: Path,
noise_path: Path,
school_proximity_path: Path,
school_catchments_path: Path,
broadband_path: Path,
conservation_areas_path: Path,
rental_prices_path: Path,
@ -2080,7 +2072,7 @@ def _build(
)
.select("postcode", "noise_lden_db")
)
school_proximity = pl.scan_parquet(school_proximity_path)
school_catchments = pl.scan_parquet(school_catchments_path)
conservation_areas = _conservation_area_by_postcode(
arcgis.select("postcode", "lat", "lon"), conservation_areas_path
)
@ -2120,7 +2112,7 @@ def _build(
"election": election,
"poi_counts": poi_counts,
"noise": noise,
"school_proximity": school_proximity,
"school_catchments": school_catchments,
"conservation_areas": conservation_areas,
"tree_density": tree_density,
"broadband": broadband,
@ -2267,10 +2259,10 @@ def main():
"--noise", type=Path, required=True, help="Road noise by postcode parquet file"
)
parser.add_argument(
"--school-proximity",
"--school-catchments",
type=Path,
required=True,
help="School proximity counts parquet file",
help="School catchment counts parquet file",
)
parser.add_argument(
"--broadband",
@ -2376,7 +2368,7 @@ def main():
ethnicity_path=args.ethnicity,
crime_path=args.crime,
noise_path=args.noise,
school_proximity_path=args.school_proximity,
school_catchments_path=args.school_catchments,
broadband_path=args.broadband,
conservation_areas_path=args.conservation_areas,
rental_prices_path=args.rental_prices,

View file

@ -25,11 +25,30 @@ POI_GROUPS_2KM = {
# Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
GROCERIES_GROUP = "Groceries"
# Groceries categories EXCLUDED from the static "Number of grocery shops and
# supermarkets within 2km" metric. Bakeries, butchers, delis and off-licences
# are speciality food retail, not somewhere you do a grocery shop; together
# they were ~a third of the group and inflated the headline count. The metric
# keeps Supermarket, Convenience Store, Greengrocer and every GEOLYTIX brand.
GROCERY_STATIC_EXCLUDED_CATEGORIES = {
"Bakery",
"Butcher & Fishmonger",
"Deli & Specialty",
"Off-Licence",
}
# OS Open Greenspace function types used for park counts and distance calculation.
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
# of green spaces that are only mapped as polygons in OSM.
# Scope: "Public Park Or Garden" is the core park function. "Playing Field"
# (open public recreation grounds) is borderline but kept: outside big cities
# the local rec ground is the de facto park. "Play Space" (playgrounds) is
# excluded — a playground is not a park, and "Playground" is already its own
# OSM-derived category. The remaining functions (Religious Grounds, Golf
# Course, Cemetery, Allotments, Bowling Green, Tennis Court, Other Sports
# Facility) are clearly not parks.
GREENSPACE_PARK_FUNCTIONS = {
"parks": ["Public Park Or Garden", "Playing Field", "Play Space"],
"parks": ["Public Park Or Garden", "Playing Field"],
}
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
@ -50,17 +69,22 @@ def _poi_category_slug(category: str) -> str:
def _groceries_categories(pois: pl.DataFrame) -> list[str]:
"""Return the distinct `category` values for the Groceries group.
"""Return the distinct `category` values for the static groceries metric.
`count_pois_per_postcode` matches POIs on `category`, but the authoritative
GEOLYTIX grocery dataset stores the brand name there (e.g. "Tesco", "Aldi")
with group "Groceries"; it never emits the literal "Supermarket". Collecting
every Groceries category captures both the OSM strings and the brand names.
Speciality food retail (bakeries, butchers, delis, off-licences) is
excluded; see GROCERY_STATIC_EXCLUDED_CATEGORIES.
"""
if "group" not in pois.columns:
raise ValueError("POI dataframe must include a 'group' column")
return (
pois.filter(pl.col("group") == GROCERIES_GROUP)
pois.filter(
(pl.col("group") == GROCERIES_GROUP)
& ~pl.col("category").is_in(list(GROCERY_STATIC_EXCLUDED_CATEGORIES))
)
.select("category")
.unique()
.sort("category")
@ -109,6 +133,40 @@ def _build_poi_category_groups(
return groups, display_names
def _greenspace_count_frame(greenspace: pl.DataFrame) -> pl.DataFrame:
    """Reduce the greenspace frame to one representative row per site.

    os_greenspace.parquet has one row per ACCESS POINT (park gate). That is
    the right grain for nearest-distance, but it over-counts "Number of
    amenities (Park) within Xkm": a large park with 30 gates would count as
    30 parks. For counting, each ``site_id`` keeps a single row, positioned
    at the site centroid (``site_lat``/``site_lng``) when those columns exist
    and are non-null, otherwise at the retained access point.

    Rows with a null ``site_id`` are passed through unchanged. A legacy
    parquet with no ``site_id`` column is returned as-is (gate-grain counts)
    with a warning, rather than crashing.
    """
    all_columns = greenspace.columns
    if "site_id" not in all_columns:
        print(
            "WARNING: greenspace parquet has no site_id column; park counts "
            "will count access points, not sites (regenerate os_greenspace)"
        )
        return greenspace

    with_site = greenspace.filter(pl.col("site_id").is_not_null())
    without_site = greenspace.filter(pl.col("site_id").is_null())

    per_site = with_site.unique(subset=["site_id"], keep="first")
    if "site_lat" in all_columns and "site_lng" in all_columns:
        # Prefer the site centroid; fall back to the access-point position.
        centroid_lat = pl.coalesce([pl.col("site_lat"), pl.col("lat")]).alias("lat")
        centroid_lng = pl.coalesce([pl.col("site_lng"), pl.col("lng")]).alias("lng")
        per_site = per_site.with_columns(centroid_lat, centroid_lng)

    # Restore the original column order so the pieces can be concatenated.
    pieces = [per_site.select(all_columns)]
    if len(without_site) > 0:
        pieces.append(without_site)
    return pl.concat(pieces)
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
renames: dict[str, str] = {}
for group_key, category in display_names.items():
@ -185,13 +243,16 @@ def main():
# Park counts and distances from OS Open Greenspace. They use the dynamic
# amenity metric names so filters read through the same side-table path as
# OSM-derived amenity metrics.
# OSM-derived amenity metrics. Distances use the access-point grain (the
# nearest park GATE is the right semantics); counts use one row per SITE so
# a park with many gates counts once.
greenspace = pl.read_parquet(args.greenspace)
greenspace_sites = _greenspace_count_frame(greenspace)
park_counts_2km = count_pois_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
)
park_counts_5km = count_pois_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
postcodes, greenspace_sites, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
)
park_distances = min_distance_per_postcode(
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS

View file

@ -260,6 +260,12 @@ def main() -> None:
)
args = parser.parse_args()
if args.greenspace and not args.greenspace.exists():
# Fail loudly and EARLY (before the ~10h Phases 1-3): silently skipping
# the subtraction is exactly how parks/lakes shipped inside postcode
# boundaries unnoticed.
raise SystemExit(f"--greenspace file not found: {args.greenspace}")
fragments_cache = args.output / "fragments_cache.parquet"
# Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
# so a greenspace change must not invalidate the fragment cache.
@ -294,7 +300,7 @@ def main() -> None:
greenspace_tree = None
greenspace_geoms = None
if args.greenspace and args.greenspace.exists():
if args.greenspace:
from .greenspace import load_greenspace
print(f" Loading greenspace/water from {args.greenspace}...")

View file

@ -3,7 +3,7 @@
from pathlib import Path
import polars as pl
from shapely import wkb
from shapely import make_valid, wkb
from shapely.geometry import MultiPolygon, Polygon
from shapely.strtree import STRtree
@ -13,12 +13,23 @@ from .geometry import safe_difference, safe_union
def load_greenspace(path: Path) -> tuple[STRtree, list]:
    """Load greenspace parquet and build an STRtree spatial index.

    Each WKB value in the ``geometry`` column is decoded exactly once.
    Geometries are repaired with ``make_valid`` on load: an invalid park/lake
    polygon would make the per-postcode ``intersects`` predicate (and the exact
    difference path) liable to raise mid-merge, hours into a build. Empty
    geometries are dropped.

    Args:
        path: Parquet file with a ``geometry`` column of WKB-encoded geometries.

    Returns:
        (tree, geoms) where tree is a Shapely STRtree and geoms is
        the list of geometries indexed by the tree.
    """
    df = pl.read_parquet(path)
    geoms = []
    for raw in df["geometry"].to_list():
        geom = wkb.loads(raw)
        if not geom.is_valid:
            geom = make_valid(geom)
        # Only non-empty geometries are indexed, so tree and list stay aligned.
        if not geom.is_empty:
            geoms.append(geom)
    tree = STRtree(geoms)
    return tree, geoms

View file

@ -101,6 +101,21 @@ def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
return geojson_dict
def _is_pointlike(geom_bng) -> bool:
    """Report whether a BNG geometry has effectively no extent.

    Both conditions must hold: area below ``_POINTLIKE_AREA_M2`` and perimeter
    below ``_POINTLIKE_PERIMETER_M``. Requiring the short perimeter as well
    keeps genuine thin slivers (tiny area, real length) out of this category.
    A geometry whose measurements raise ``GEOSException`` is reported as not
    pointlike.
    """
    try:
        # Perimeter is only consulted once the area test has passed.
        if not (geom_bng.area < _POINTLIKE_AREA_M2):
            return False
        return geom_bng.length < _POINTLIKE_PERIMETER_M
    except GEOSException:
        return False
def _rescue_footprint(geom_bng) -> dict | None:
"""Fatten a degenerate BNG geometry into a representable footprint and snap.
@ -109,15 +124,9 @@ def _rescue_footprint(geom_bng) -> dict | None:
gets a building-scale buffer so it is not reduced to an invisible sub-metre
dot; thin slivers that still carry length keep the minimal buffer.
"""
buffer_m = _MIN_FOOTPRINT_BUFFER_M
try:
if (
geom_bng.area < _POINTLIKE_AREA_M2
and geom_bng.length < _POINTLIKE_PERIMETER_M
):
buffer_m = _POINT_RESCUE_BUFFER_M
except GEOSException:
pass
buffer_m = (
_POINT_RESCUE_BUFFER_M if _is_pointlike(geom_bng) else _MIN_FOOTPRINT_BUFFER_M
)
footprint = _largest_polygonal(geom_bng.buffer(buffer_m))
if footprint is None:
return None
@ -147,6 +156,12 @@ def to_wgs84_geojson(
)
if simplified is None:
simplified = cleaned
if _is_pointlike(simplified):
# A POINTLIKE footprint is rescued to building scale even when it
# would survive snapping: a 0.1-1 m² polygon serializes fine but
# ships as an invisible dot covering a whole tower block.
result = _rescue_footprint(simplified)
else:
# Normal path; if snapping erases a thin sliver, fatten its real shape.
result = _snap_to_wgs84_geojson(simplified)
if result is None:
@ -229,6 +244,10 @@ def merge_fragments(
greenspace_tree: Optional STRtree of park/water polygons.
greenspace_geoms: Optional list of park/water geometries (indexed by tree).
"""
subtract = greenspace_tree is not None and greenspace_geoms is not None
if subtract:
from .greenspace import subtract_greenspace
by_postcode: dict[str, list] = defaultdict(list)
for pc, geom in all_fragments:
by_postcode[pc].append(geom)
@ -256,9 +275,7 @@ def merge_fragments(
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
combined = _fill_holes(combined)
# Subtract parks/water if provided
if greenspace_tree is not None and greenspace_geoms is not None:
from .greenspace import subtract_greenspace
if subtract:
pre_green = combined
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
combined = _keep_polygon_parts(combined)

View file

@ -921,6 +921,49 @@ class TestToWgs84Geojson:
area_m2 = transform_geometry(to_bng.transform, shape(result)).area
assert area_m2 > 100, f"point footprint only {area_m2:.1f} m^2"
def test_snappable_pointlike_polygon_still_gets_building_scale_footprint(self):
"""A collapsed-but-snappable footprint (e.g. EC2A 2FJ: 181 properties on
0.86 m²) must NOT ship as-is just because it survives precision snapping;
pointlike inputs are rescued to a ~201 m² disc unconditionally."""
import pyproj
from shapely.geometry import shape
from shapely.ops import transform as transform_geometry
to_bng = pyproj.Transformer.from_crs(
"EPSG:4326", "EPSG:27700", always_xy=True
)
# 0.9m x 0.9m square: area 0.81 m², perimeter 3.6 m — pointlike, yet
# large enough (~8 output-grid cells) to survive the 1e-6 deg snap.
tiny = box(530000, 180000, 530000.9, 180000.9)
from .output import _snap_to_wgs84_geojson
assert _snap_to_wgs84_geojson(tiny) is not None, (
"precondition: this polygon must be snappable, otherwise the test "
"exercises the old snap-fails path instead of the new one"
)
result = to_wgs84_geojson(tiny)
assert result is not None
area_m2 = transform_geometry(to_bng.transform, shape(result)).area
assert 150 < area_m2 < 300, (
f"pointlike snappable footprint shipped at {area_m2:.2f} m^2 "
"instead of a building-scale (~201 m^2) disc"
)
def test_normal_polygon_area_unchanged(self):
"""A normal polygon must pass through without rescue inflation."""
import pyproj
from shapely.geometry import shape
from shapely.ops import transform as transform_geometry
to_bng = pyproj.Transformer.from_crs(
"EPSG:4326", "EPSG:27700", always_xy=True
)
poly = box(530000, 180000, 530100, 180100) # 10,000 m²
result = to_wgs84_geojson(poly)
assert result is not None
area_m2 = transform_geometry(to_bng.transform, shape(result)).area
assert area_m2 == pytest.approx(10_000, rel=0.01)
def test_thin_sliver_keeps_minimal_buffer(self):
"""A genuine elongated sliver still carries length, so it is NOT inflated
to building scale — only truly pointlike inputs are."""
@ -1132,6 +1175,26 @@ class TestSubtractGreenspace:
# 80% < 90% cap, so subtraction should happen
assert result.area == pytest.approx(2000, rel=0.01)
def test_load_greenspace_repairs_invalid_and_drops_empty(self, tmp_path):
"""An invalid (bow-tie) park polygon in the parquet must be repaired on
load: it would otherwise make the per-postcode intersects/difference
liable to raise hours into a merge."""
from .greenspace import load_greenspace
bowtie = Polygon([(0, 0), (10, 10), (10, 0), (0, 10)]) # self-intersects
assert not bowtie.is_valid
valid = box(20, 20, 30, 30)
path = tmp_path / "greenspace.parquet"
pl.DataFrame({"geometry": [bowtie.wkb, valid.wkb]}).write_parquet(path)
tree, geoms = load_greenspace(path)
assert len(geoms) == 2
assert all(g.is_valid and not g.is_empty for g in geoms)
# The repaired bow-tie must still subtract cleanly.
result = subtract_greenspace(box(0, 0, 100, 100), tree, geoms)
assert result.is_valid
assert result.area < 10_000
class TestToWgs84GeojsonValidity:
"""to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""

View file

@ -26,6 +26,7 @@ from pipeline.transform.price_estimation.shrinkage import (
from pipeline.transform.price_estimation.utils import (
CURRENT_YEAR,
LATEST_COMPLETE_YEAR,
SMOOTHNESS_SUPPORT_PAIRS,
TEMPORAL_SMOOTHNESS_LAMBDA,
TYPE_GROUPS,
build_hedonic_features,
@ -37,6 +38,19 @@ from pipeline.transform.price_estimation.utils import (
MIN_PAIRS = 5
OUTLIER_THRESHOLD = 3.0 # hard pre-filter; Huber handles the rest
# Gap-aware companion to OUTLIER_THRESHOLD: |log_ratio| must also stay within
# this many log-units PER YEAR of holding period (short gaps are allowed a
# full year's band). A flat +/-3.0 cap admits e.g. a 10k -> 196k "sale" six
# months apart (log +2.95, and weight 1/sqrt(gap) gives it the leverage of
# ~10 normal pairs); Huber does NOT recover, because once the thin year's
# beta satisfies the garbage pair it is the many good long-gap pairs that
# carry the residual and get down-weighted. Such pairs are data errors or
# non-market transfers (right-to-buy, probate, flips), not house-price
# signal -- standard repeat-sales practice (Case-Shiller) excludes extreme
# annualised returns for the same reason. 0.7 log/yr (~2x in a year) keeps
# any plausible genuine market move; long-gap pairs are still governed by
# the +/-3.0 cap.
ANNUALISED_OUTLIER_THRESHOLD = 0.7
HUBER_K = 1.345
IRLS_ITERATIONS = 5
@ -111,7 +125,16 @@ def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFram
/ (pl.col("frac_year2") - pl.col("frac_year1")).cast(pl.Float64).sqrt()
).alias("weight"),
)
.filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
.filter(
pl.col("log_ratio").abs()
<= pl.min_horizontal(
pl.lit(OUTLIER_THRESHOLD),
ANNUALISED_OUTLIER_THRESHOLD
* pl.max_horizontal(
pl.col("frac_year2") - pl.col("frac_year1"), pl.lit(1.0)
),
)
)
.collect()
)
@ -181,11 +204,27 @@ def solve_robust_index(
# beta=0) has no column, so the penalty spans the non-baseline years only.
# For cells with <3 betas there is no curvature to penalise and the solve is
# unchanged.
#
# The penalty is SUPPORT-SCALED per row: a flat lambda is too weak for
# years identified by only 1-2 repeat-sale pairs (a cell can have hundreds
# of pairs overall yet single thin years, yielding 2-7x one-year spikes
# that cell-level shrinkage cannot catch). Each curvature row's lambda is
# lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / s), with s the minimum
# cross-year pair count among the row's three years, so thin years are
# pulled strongly toward the local trend while well-supported years keep
# the baseline penalty. Taking the min over the triple (not just the
# middle year) also covers thin FIRST/LAST years of the range, which only
# ever appear at a triple's edge -- the last solved year feeds the
# CURRENT_YEAR trend extrapolation, so spikes there are the costliest.
n_pen = 0
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
cross = years1 != years2
touched, counts = np.unique(
np.concatenate([years1[cross], years2[cross]]), return_counts=True
)
support = {int(y): int(c) for y, c in zip(touched, counts)}
years_sorted = sorted(year_to_col)
cols_by_year = [year_to_col[y] for y in years_sorted]
n_pen = n_cols - 2
@ -202,6 +241,11 @@ def solve_robust_index(
w0 = 2.0 / ((y1 - y0) * (y2 - y0))
w1 = -2.0 / ((y1 - y0) * (y2 - y1))
w2 = 2.0 / ((y2 - y1) * (y2 - y0))
s_k = min(support.get(y, 0) for y in (y0, y1, y2))
lam_k = TEMPORAL_SMOOTHNESS_LAMBDA * (
1.0 + SMOOTHNESS_SUPPORT_PAIRS / max(s_k, 1)
)
sqrt_lambda = float(np.sqrt(lam_k))
pen_vals[3 * k : 3 * k + 3] = (
sqrt_lambda * w0,
sqrt_lambda * w1,
@ -347,10 +391,22 @@ def compute_hedonic_index(
EXTRAPOLATION_YEARS = 3
# Bound on the per-year slope used to trend-extrapolate beyond the last solved
# year (the solve stops at LATEST_COMPLETE_YEAR; CURRENT_YEAR is filled here).
# +/-0.10 log/yr (~+/-10.5%/yr) comfortably covers genuine UK sector-level
# annual moves while preventing a residual spike in the recent betas from
# compounding into an absurd extrapolated step (e.g. +49% in one year).
MAX_EXTRAPOLATION_SLOPE = 0.10
def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
"""Forward-fill missing years, with linear extrapolation beyond last known year."""
"""Forward-fill missing years, with trend extrapolation beyond last known year.
The extrapolation slope is the MEDIAN of the per-year slopes between
consecutive known points in the recent window (a single noisy year corrupts
at most one of those slopes, unlike a least-squares fit through all the
points), clamped to +/-MAX_EXTRAPOLATION_SLOPE.
"""
if not index:
return {y: 0.0 for y in range(min_year, max_year + 1)}
@ -365,7 +421,7 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
last = index[y]
filled[y] = last
# Linear extrapolation beyond last known year
# Robust trend extrapolation beyond last known year
if last_known_year < max_year:
recent = [
(y, index[y])
@ -373,9 +429,17 @@ def forward_fill(index: dict, min_year: int, max_year: int) -> dict:
if y >= last_known_year - EXTRAPOLATION_YEARS
]
if len(recent) >= 2:
years_arr = np.array([r[0] for r in recent], dtype=np.float64)
vals_arr = np.array([r[1] for r in recent], dtype=np.float64)
slope = np.polyfit(years_arr, vals_arr, 1)[0]
slopes = [
(v_b - v_a) / (y_b - y_a)
for (y_a, v_a), (y_b, v_b) in zip(recent[:-1], recent[1:])
]
slope = float(
np.clip(
np.median(slopes),
-MAX_EXTRAPOLATION_SLOPE,
MAX_EXTRAPOLATION_SLOPE,
)
)
for y in range(last_known_year + 1, max_year + 1):
filled[y] = index[last_known_year] + slope * (y - last_known_year)
else:
@ -389,12 +453,16 @@ def build_index(
input_path: Path,
max_pair_year: int | None = None,
postcodes_path: Path | None = None,
sectors: list[str] | None = None,
) -> pl.DataFrame:
"""Build the full price index from raw data.
If max_pair_year is set, only pairs before that year are used (backtesting holdout).
The index is still forward-filled to CURRENT_YEAR.
postcodes_path: if provided, lat/lon are read from this file instead of input_path.
sectors: if provided, restrict the build to these postcode sectors (for
debugging/verification runs; hierarchy levels are then computed only from
the scoped pairs, so scoped output is NOT identical to a full build).
"""
# Solve the index only on COMPLETE calendar years: exclude the partial
# current year, whose thin repeat-sale set yields wild betas. The index is
@ -405,6 +473,9 @@ def build_index(
max_pair_year if max_pair_year is not None else LATEST_COMPLETE_YEAR + 1
)
pairs = extract_pairs(input_path, max_year2=estimation_cap)
if sectors is not None:
pairs = pairs.filter(pl.col("sector").is_in(sectors))
print(f" Scoped to {len(sectors)} sectors: {len(pairs):,} pairs")
centroids = extract_centroids(postcodes_path or input_path)
min_year = int(pairs["year1"].min())
@ -534,9 +605,21 @@ def main():
help="Path to postcode.parquet (for lat/lon centroids)",
)
parser.add_argument("--output", type=Path, required=True)
parser.add_argument(
"--sectors",
type=str,
default=None,
help="Comma-separated postcode sectors to scope the build to "
"(debug/verification only; hierarchy is computed from scoped pairs)",
)
args = parser.parse_args()
result = build_index(args.input, postcodes_path=args.postcodes)
sectors = (
[s.strip() for s in args.sectors.split(",") if s.strip()]
if args.sectors
else None
)
result = build_index(args.input, postcodes_path=args.postcodes, sectors=sectors)
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)

View file

@ -3,7 +3,10 @@ import polars as pl
from pipeline.transform.price_estimation import index as index_mod
from pipeline.transform.price_estimation.index import (
MAX_EXTRAPOLATION_SLOPE,
compute_indices_for_level,
extract_pairs,
forward_fill,
solve_robust_index,
)
@ -105,6 +108,139 @@ def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
assert abs(idx[2015] - true[2015]) < 0.05
def _ramp_pairs_with_thin_tail(tail_ratio: float, tail_n: int, ramp_reps: int):
"""Smooth 0.04/yr ramp 2010-2020 with `ramp_reps` copies of each adjacent
pair, plus `tail_n` pair(s) 2020->2021 asserting a `tail_ratio` jump."""
years = range(2010, 2021)
true = {y: 0.04 * (y - 2010) for y in years}
y1, y2, lr, w = [], [], [], []
for a in range(2010, 2020):
for _ in range(ramp_reps):
y1.append(a)
y2.append(a + 1)
lr.append(true[a + 1] - true[a])
w.append(1.0)
for _ in range(tail_n):
y1.append(2020)
y2.append(2021)
lr.append(tail_ratio)
w.append(1.0)
return (
np.array(y1, dtype=np.int32),
np.array(y2, dtype=np.int32),
np.array(lr, dtype=np.float64),
np.array(w, dtype=np.float64),
)
def test_support_scaled_penalty_suppresses_thin_year_spike(monkeypatch):
"""A final year identified by a SINGLE pair claiming a +1.5 log jump is
pulled strongly toward the local trend; with the flat baseline penalty
(support scaling off) the jump survives almost entirely. The thin year is
the LAST year of the range (only ever at a penalty triple's edge), proving
the min-over-triple support rule covers range edges -- the last solved year
feeds the CURRENT_YEAR trend extrapolation."""
y1, y2, lr, w = _ramp_pairs_with_thin_tail(tail_ratio=1.5, tail_n=1, ramp_reps=10)
monkeypatch.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
flat = solve_robust_index(y1, y2, lr, w)
monkeypatch.undo()
scaled = solve_robust_index(y1, y2, lr, w)
flat_step = flat[2021] - flat[2020]
scaled_step = scaled[2021] - scaled[2020]
assert flat_step > 1.2 # flat lambda barely resists the spike
assert scaled_step < 0.65 # support-scaled lambda suppresses it
# The well-supported ramp stays close to truth: the strong penalty row
# spanning the thin year drags its immediate neighbour slightly (<0.1)
# toward collinearity -- the price of suppressing a x4.5 one-year spike.
for y in range(2010, 2021):
assert abs(scaled[y] - 0.04 * (y - 2010)) < 0.1
def test_support_scaling_leaves_well_supported_years_unchanged(monkeypatch):
"""With ample pairs everywhere (support 50-100 per year), lambda_eff ~
lambda0 and the solution matches the flat-penalty solve to <1e-3."""
y1, y2, lr, w = _ramp_pairs_with_thin_tail(tail_ratio=0.04, tail_n=50, ramp_reps=50)
monkeypatch.setattr(index_mod, "SMOOTHNESS_SUPPORT_PAIRS", 0)
flat = solve_robust_index(y1, y2, lr, w)
monkeypatch.undo()
scaled = solve_robust_index(y1, y2, lr, w)
assert set(flat) == set(scaled)
assert max(abs(flat[y] - scaled[y]) for y in flat) < 1e-3
def test_forward_fill_extrapolation_uses_robust_median_slope():
"""A residual spike in ONE recent year must not corrupt the extrapolated
step: the median of consecutive per-year slopes ignores it (a least-squares
fit through the same points would extrapolate a large positive slope)."""
index = {2022: 1.00, 2023: 1.05, 2024: 1.60, 2025: 1.10}
filled = forward_fill(index, 2022, 2026)
# slopes: [+0.05, +0.55, -0.50] -> median +0.05
assert abs(filled[2026] - (1.10 + 0.05)) < 1e-9
def test_forward_fill_extrapolated_slope_is_clamped():
"""A consistent (but absurd) recent trend is clamped to MAX_EXTRAPOLATION_SLOPE."""
index = {2022: 0.0, 2023: 0.4, 2024: 0.8, 2025: 1.2}
filled = forward_fill(index, 2022, 2026)
assert abs(filled[2026] - (1.2 + MAX_EXTRAPOLATION_SLOPE)) < 1e-9
index_down = {2022: 1.2, 2023: 0.8, 2024: 0.4, 2025: 0.0}
filled_down = forward_fill(index_down, 2022, 2026)
assert abs(filled_down[2026] - (0.0 - MAX_EXTRAPOLATION_SLOPE)) < 1e-9
def test_forward_fill_preserves_sane_trend_and_flat_fallback():
"""Genuine moderate trends still extrapolate (it stays a forward-FILL-with-
trend); with <2 recent points the fill is flat."""
index = {2022: 1.00, 2023: 1.05, 2024: 1.10, 2025: 1.15}
filled = forward_fill(index, 2022, 2026)
assert abs(filled[2026] - 1.20) < 1e-9
assert forward_fill({2025: 0.7}, 2024, 2026)[2026] == 0.7
def test_extract_pairs_drops_extreme_annualised_returns(tmp_path):
"""A +-3.0 log cap alone admits e.g. a 10x 'gain' in six months -- a data
error or non-market transfer with huge leverage (weight = 1/sqrt(gap)).
Such pairs are dropped via the annualised cap; large ratios over long
holding periods (genuine appreciation) are kept."""
df = pl.DataFrame(
{
"Postcode": ["AB1 2CD", "AB1 2CE", "AB1 2CF"],
"Property type": ["Detached", "Detached", "Detached"],
"historical_prices": [
# +2.30 log in 6 months -> dropped (cap 0.7 for gap <= 1yr)
[
{"year": 2020, "month": 1, "price": 100_000},
{"year": 2020, "month": 7, "price": 1_000_000},
],
# +2.20 log over 24 years -> kept (flat 3.0 cap governs)
[
{"year": 2000, "month": 1, "price": 100_000},
{"year": 2024, "month": 1, "price": 900_000},
],
# +0.41 log in 1 year -> kept (within the 0.7/yr band)
[
{"year": 2020, "month": 1, "price": 100_000},
{"year": 2021, "month": 1, "price": 150_000},
],
],
}
)
path = tmp_path / "props.parquet"
df.write_parquet(path)
pairs = extract_pairs(path)
assert len(pairs) == 2
ratios = sorted(round(r, 2) for r in pairs["log_ratio"].to_list())
assert ratios == [0.41, 2.2]
def test_n_pairs_counts_only_cross_year_pairs():
"""FIX #12: same-year pairs carry zero index information and must not inflate
the shrinkage weight; n_pairs counts only cross-year (year2 != year1) pairs."""

View file

@ -36,6 +36,20 @@ SHRINKAGE_K = 50
# noisy year) without flattening genuine multi-year trends.
TEMPORAL_SMOOTHNESS_LAMBDA = 0.05
# Per-year support scaling for the temporal smoothness penalty. A flat lambda
# is too weak for years with very few repeat-sale pairs: a sector can have
# hundreds of pairs overall (so cell-level n/(n+k) shrinkage barely moves it)
# yet have individual years estimated from 1-2 pairs, producing 2-7x
# single-year index spikes. Each curvature row is therefore scaled by the
# local pair support of its year triple:
# lambda_eff = lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / s)
# where s is the minimum cross-year pair count among the triple's years.
# Well-supported years (s >> SMOOTHNESS_SUPPORT_PAIRS) keep lambda_eff ~
# lambda0 (current behaviour); a year identified by a single pair gets
# ~41x lambda0, pulling its beta strongly toward the local trend through its
# neighbours. Same-year pairs cancel in the design and are not counted.
SMOOTHNESS_SUPPORT_PAIRS = 40
def type_group_expr():
"""Polars expression: Property type -> type_group."""

View file

@ -0,0 +1,748 @@
"""Model historical school catchment areas and count them per postcode.
No national dataset of school catchment areas exists for England: catchments
are set per admission authority, only a handful of councils publish polygons,
and the pupil-residence data behind commercial "heatmap" catchments lives in
the restricted National Pupil Database. This module therefore COMPILES one
from open data, estimating each school's admission cutoff distance ("last
distance offered") — the radius within which an applicant would plausibly be
offered a place.
Model: English state admissions are run as deferred acceptance with distance
tie-breaks, which in a continuum economy is equivalent to finding
market-clearing cutoff distances (Azevedo & Leshno 2016). Per phase
(primary/secondary):
1. Demand — Census 2021 children per LSOA (TS007A age bands, prorated to the
phase's cohort ages) split evenly across the LSOA's live postcodes.
2. Supply — every open, non-selective state-funded school (GIAS), with a fill
target of max(capacity, headcount) prorated to the phase's cohorts
(sixth-form and nursery years carry reduced weight, since their class
sizes differ and they are not allocated by the same admissions round).
3. Preferences — children prefer nearby schools, trading distance against
Ofsted grade: a school's effective distance is its real distance minus a
grade bonus (Outstanding > Good > ungraded > below-Good). Because real
first preferences are heterogeneous, each postcode's children split
across nearby feasible schools with logit weights over effective
distance rather than all picking the same one.
4. Equilibrium — cutoffs start unbounded and tighten monotonically: each
round, children apply to their preferred feasible school(s), and
oversubscribed schools tighten their cutoff to the distance of their
marginal admitted child. Converges to the deferred-acceptance outcome.
5. Schools that never fill have no binding cutoff — anyone who applies gets
in — so their feasibility radius is the distance within which the local
child population would cover their fill target, capped.
The free parameters (preference bonuses, demand scale, choice temperature,
residual calibration factors) are CALIBRATED against published "last
distance offered" figures scraped from nine local authorities' allocation
reports — see check_school_cutoffs.py and the constants below.
A postcode is "inside the catchment" of every school whose cutoff radius
covers it. The output counts those schools per postcode for the four
good+/outstanding x primary/secondary categories (Ofsted-classified, same
rules as the previous proximity metric). Selective (grammar) schools are
excluded throughout: their intakes are test-based and region-wide, so a
distance model would fabricate a catchment that does not exist.
Known limitations: faith oversubscription criteria are not modelled (whether
a faith school's catchment is open to a given family depends on the family),
and Census 2021 child counts lag current rolls slightly. Cutoffs are
straight-line distances, the modal LA tie-break criterion.
"""
import argparse
from pathlib import Path
import numpy as np
import polars as pl
from scipy.spatial import cKDTree
from pipeline.utils.poi_counts import _project_lat_lng_km, valid_uk_coords_mask
SCHOOL_GROUPS = {
"good_primary": ["good_primary", "outstanding_primary"],
"good_secondary": ["good_secondary", "outstanding_secondary"],
"outstanding_primary": ["outstanding_primary"],
"outstanding_secondary": ["outstanding_secondary"],
}
# Age thresholds for deciding which phase(s) a school serves. A school serves
# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
# both the primary and the secondary metrics — Ofsted's coarse "Ofsted phase"
# labels such schools as just "Secondary", which previously hid them from every
# postcode's primary-school count.
PRIMARY_MAX_AGE = 10
SECONDARY_MIN_AGE = 12
# Cohort ages (inclusive) each phase competes for: Reception-Y6 and Y7-Y11.
PRIMARY_AGES = (4, 10)
SECONDARY_AGES = (11, 15)
# Cohort weights for prorating a school's headcount/capacity across the ages
# it teaches. Nursery classes are typically part-time and small; sixth forms
# run at roughly 60% of a school's Y7-Y11 cohort size. A flat proration
# undersupplied secondary places by ~8%.
NURSERY_COHORT_WEIGHT = 0.5 # ages < 4
SIXTH_FORM_COHORT_WEIGHT = 0.6 # ages >= 16
# Only schools that admit (mostly) by geography take part in the assignment.
# Independent, special and Welsh schools and post-16 colleges either don't
# admit by distance or fall outside the England postcode universe; selective
# (grammar) schools admit by test from a wide region.
STATE_SCHOOL_TYPE_GROUPS = [
"Academies",
"Local authority maintained schools",
"Free Schools",
]
# Preference bonuses (km of extra travel a family accepts for a better
# school), applied as a discount on effective distance when children choose.
# Grade 3/4 schools repel by the same magnitudes.
PREF_BONUS_OUTSTANDING_KM = 0.6
PREF_BONUS_GOOD_KM = 0.3
# Share of resident children who actually compete for state places. Census
# 2021 counts overstate current entry cohorts (birth rates fell ~10% between
# 2016 and 2021, which is exactly the gap between the census stock and the
# children reaching Reception by mid-decade) and independent/home-educated
# children (~7%) never enter the allocation at all. Without this, modelled
# cutoffs run systematically tight and undersubscribed schools look full.
DEMAND_SCALE = 0.8
# Logit choice temperature (km). With deterministic choice every child at a
# postcode ranks the same school first, so popular schools fill entirely from
# their nearest band and the marginal admitted child sits unrealistically
# close. Real first preferences are heterogeneous; a school draws only a
# distance-decaying share of nearby families. Children therefore split across
# nearby feasible schools with weights softmax(-effective_distance / tau):
# higher tau = more smearing = wider cutoffs. tau -> 0 recovers the
# deterministic model (used by the unit tests). Calibrated 2026-06 against
# 240 published binding cutoffs from 9 LAs (check_school_cutoffs.py): 0.3 km
# maximises rank correlation and within-2x share; beyond ~0.6 the smearing
# erases school-to-school differentiation (Spearman 0.24 -> 0.01).
CHOICE_TEMPERATURE_KM = 0.3
# Residual calibration from the same ground truth: after the equilibrium
# solve, modelled cutoffs still ran systematically tight (median log2 bias
# -0.53 primary / -0.36 secondary at the settings above — published "last
# distance offered" reflects offer-day frictions, waiting-list churn and
# furthest-applicant noise that no clean equilibrium reproduces). Radii are
# multiplied by 2^-bias so the modelled median matches the published median;
# rank ordering is unaffected.
CUTOFF_CALIBRATION_FACTOR = {"primary": 1.44, "secondary": 1.28}
# Each demand postcode considers this many nearest schools; beyond ~16
# candidates assignment shares are negligible.
NEAREST_SCHOOL_CANDIDATES = 16
# Radius guard rails: the floor absorbs postcode-centroid noise around tiny
# urban catchments; the cap bounds feasibility radii for schools the model
# never fills (mostly rural).
MIN_RADIUS_KM = 0.3
MAX_RADIUS_KM = 25.0
EQUILIBRIUM_MAX_ITER = 100
def classify_good_plus_schools(
    ofsted: pl.DataFrame, open_urns: set[int] | None = None
) -> pl.DataFrame:
    """Build the ``(urn, category)`` rows that feed the catchment counts.

    A school qualifies when its "Ofsted phase" is Primary or Secondary and its
    derived grade (see ``_with_derived_grade``) is "1" (outstanding) or "2"
    (good). The derived grade prefers the graded OEIF result and only falls
    back to an ungraded "School remains Good/Outstanding" outcome when no
    usable graded result exists, so a genuine grade 3/4 is never overridden;
    "(Concerns)" outcomes are never treated as good+.

    Phase assignment uses the statutory age range when both age columns are
    present, so all-through and middle schools can count toward BOTH primary
    and secondary. Rows whose age is missing, and frames without the age
    columns, fall back to the coarse "Ofsted phase" label.

    Args:
        ofsted: Ofsted inspection frame.
        open_urns: Optional set of URNs from the GIAS open register. When
            given (and the frame has a "URN" column) schools outside it are
            dropped so closed/merged schools are not counted.

    Returns:
        Frame with ``urn`` (Int64) and ``category``, one of
        ``good_primary``, ``outstanding_primary``, ``good_secondary``,
        ``outstanding_secondary``. A school can contribute up to two rows.
    """
    rated = _with_derived_grade(ofsted).filter(
        pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
        & pl.col("_ofsted_grade").is_in(["1", "2"])
    )

    # Stale "latest inspection" rows for closed/merged schools are removed
    # when the open register is available.
    if open_urns is not None and "URN" in rated.columns:
        rated = rated.filter(pl.col("URN").is_in(list(open_urns)))

    # Start from the coarse label and refine with statutory ages if present.
    phase_label = pl.col("Ofsted phase")
    is_primary = phase_label == "Primary"
    is_secondary = phase_label == "Secondary"
    age_columns = {"Statutory lowest age", "Statutory highest age"}
    if age_columns.issubset(rated.columns):
        lowest = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
        highest = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
        is_primary = (
            pl.when(lowest.is_not_null())
            .then(lowest <= PRIMARY_MAX_AGE)
            .otherwise(is_primary)
        )
        is_secondary = (
            pl.when(highest.is_not_null())
            .then(highest >= SECONDARY_MIN_AGE)
            .otherwise(is_secondary)
        )
    rated = rated.with_columns(
        is_primary.alias("_serves_primary"),
        is_secondary.alias("_serves_secondary"),
    )

    # One frame per phase; grade "1" is labelled outstanding, anything else
    # that survived the filter above (grade "2") is labelled good.
    per_phase = [
        rated.filter(pl.col(flag)).with_columns(
            pl.when(pl.col("_ofsted_grade") == "1")
            .then(pl.lit(outstanding_label))
            .otherwise(pl.lit(good_label))
            .alias("category")
        )
        for flag, outstanding_label, good_label in (
            ("_serves_primary", "outstanding_primary", "good_primary"),
            ("_serves_secondary", "outstanding_secondary", "good_secondary"),
        )
    ]
    return pl.concat(per_phase).select(
        pl.col("URN").cast(pl.Int64).alias("urn"),
        "category",
    )
def _with_derived_grade(ofsted: pl.DataFrame) -> pl.DataFrame:
    """Add an ``_ofsted_grade`` column ("1"-"4" or null) to ``ofsted``.

    A graded OEIF result of "1".."4" is used as-is. Otherwise, and only when
    the graded column is null or "Not judged", an ungraded outcome starting
    with "School remains Outstanding" / "School remains Good" maps to "1" /
    "2", unless that outcome contains "(Concerns)". Everything else is null.
    """
    # Both columns are cast to Utf8 so the string predicates stay well-defined
    # even if a column was read back as an all-null (Null dtype) column.
    graded_result = pl.col("Latest OEIF overall effectiveness").cast(
        pl.Utf8, strict=False
    )
    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )

    fallback_allowed = graded_result.is_null() | (graded_result == "Not judged")
    without_concerns = ~ungraded_outcome.str.contains(r"\(Concerns\)")
    still_outstanding = (
        ungraded_outcome.str.starts_with("School remains Outstanding")
        & without_concerns
    )
    still_good = (
        ungraded_outcome.str.starts_with("School remains Good") & without_concerns
    )

    derived_grade = (
        pl.when(graded_result.is_in(["1", "2", "3", "4"]))
        .then(graded_result)
        .when(fallback_allowed & still_outstanding)
        .then(pl.lit("1"))
        .when(fallback_allowed & still_good)
        .then(pl.lit("2"))
        .otherwise(None)
    )
    return ofsted.with_columns(derived_grade.alias("_ofsted_grade"))
def school_preference_bonuses(
    ofsted: pl.DataFrame,
    bonus_outstanding_km: float = PREF_BONUS_OUTSTANDING_KM,
    bonus_good_km: float = PREF_BONUS_GOOD_KM,
) -> pl.DataFrame:
    """Translate each school's derived Ofsted grade into a bonus in km.

    Grades "1" and "2" receive ``bonus_outstanding_km`` and ``bonus_good_km``;
    grades "3" and "4" receive the mirrored negative values. Any other grade
    (including null, i.e. no derived grade) gets 0.0.

    Args:
        ofsted: Ofsted inspection frame with a "URN" column.
        bonus_outstanding_km: Magnitude applied to grades "1" (+) and "4" (-).
        bonus_good_km: Magnitude applied to grades "2" (+) and "3" (-).

    Returns:
        ``(urn, bonus_km)`` with one row per non-null URN; when a URN appears
        more than once the first occurrence is kept.
    """
    grade_to_bonus_km = {
        "1": bonus_outstanding_km,
        "2": bonus_good_km,
        "3": -bonus_good_km,
        "4": -bonus_outstanding_km,
    }
    with_known_urn = _with_derived_grade(ofsted).filter(pl.col("URN").is_not_null())
    urn = pl.col("URN").cast(pl.Int64).alias("urn")
    bonus_km = (
        pl.col("_ofsted_grade")
        .replace_strict(grade_to_bonus_km, default=0.0, return_dtype=pl.Float64)
        .alias("bonus_km")
    )
    return with_known_urn.select(urn, bonus_km).unique(subset="urn", keep="first")
def phase_intakes(gias: pl.DataFrame) -> pl.DataFrame:
    """Split each school's fill target between the primary and secondary phase.

    Keeps schools whose ``type_group`` is in ``STATE_SCHOOL_TYPE_GROUPS``,
    whose ``admissions_policy`` is null or not "Selective", and that have
    non-null ``lat``/``lng``, a parseable age range and a positive fill target.

    The fill target is max(pupils, capacity) with nulls counted as 0, so
    over-full schools keep their demonstrated size and under-full schools can
    admit up to capacity. It is spread over the cohort ages the school teaches
    (the first two numbers in ``age_range``, e.g. "3-11" = ages 3..10), with
    ages 0-3 weighted by ``NURSERY_COHORT_WEIGHT`` and ages 16-30 by
    ``SIXTH_FORM_COHORT_WEIGHT``. Each phase then receives the share of that
    total weight falling inside ``PRIMARY_AGES`` / ``SECONDARY_AGES``.

    Returns:
        ``(urn, lat, lng, primary_intake, secondary_intake)``.
    """
    age_numbers = pl.col("age_range").str.extract_all(r"\d+")
    youngest = age_numbers.list.get(0, null_on_oob=True).cast(pl.Int64, strict=False)
    # The leaving age is exclusive as a cohort: a "3-11" school teaches
    # children aged 3 through 10, hence the "- 1".
    oldest = (
        age_numbers.list.get(1, null_on_oob=True).cast(pl.Int64, strict=False) - 1
    )

    in_scope = (
        pl.col("type_group").is_in(STATE_SCHOOL_TYPE_GROUPS)
        & (
            pl.col("admissions_policy").is_null()
            | (pl.col("admissions_policy") != "Selective")
        )
        & pl.col("lat").is_not_null()
        & pl.col("lng").is_not_null()
    )
    fill_target = pl.max_horizontal(
        pl.col("pupils").fill_null(0), pl.col("capacity").fill_null(0)
    ).cast(pl.Float64)

    schools = (
        gias.filter(in_scope)
        .with_columns(youngest.alias("_low"), oldest.alias("_high"))
        .filter(pl.col("_low").is_not_null() & (pl.col("_high") >= pl.col("_low")))
        .with_columns(fill_target.alias("_fill_target"))
        .filter(pl.col("_fill_target") > 0)
    )

    def weighted_overlap(lo: int, hi: int, weight: float = 1.0) -> pl.Expr:
        """Cohort weight contributed by ages [lo, hi] within [_low, _high]."""
        first_age = pl.max_horizontal(pl.col("_low"), lo)
        last_age = pl.min_horizontal(pl.col("_high"), hi)
        # Disjoint ranges would give a negative year count; clip to zero.
        years = (last_age - first_age + 1).clip(lower_bound=0)
        return (weight * years).cast(pl.Float64)

    total_weight = (
        weighted_overlap(0, 3, NURSERY_COHORT_WEIGHT)
        + weighted_overlap(4, 15)
        + weighted_overlap(16, 30, SIXTH_FORM_COHORT_WEIGHT)
    )
    primary_share = weighted_overlap(*PRIMARY_AGES) / total_weight
    secondary_share = weighted_overlap(*SECONDARY_AGES) / total_weight

    return schools.select(
        pl.col("urn").cast(pl.Int64),
        "lat",
        "lng",
        (pl.col("_fill_target") * primary_share).alias("primary_intake"),
        (pl.col("_fill_target") * secondary_share).alias("secondary_intake"),
    )
def children_per_postcode(
    postcodes: pl.DataFrame, lsoa_children: pl.DataFrame
) -> pl.DataFrame:
    """Spread each LSOA's phase-age children evenly over its postcodes.

    Census five-year bands do not line up with school phases, so each phase
    takes one fifth of a band per single year of age it needs:

    * primary (4-10) = 0.2 * aged_0_4 + aged_5_9 + 0.2 * aged_10_14
    * secondary (11-15) = 0.8 * aged_10_14 + 0.2 * aged_15_19

    Args:
        postcodes: Frame with ``postcode``, ``lat``, ``lng`` and ``lsoa21cd``.
        lsoa_children: Frame keyed by ``lsoa21`` with the ``aged_*`` bands.

    Returns:
        ``(postcode, lat, lng, primary_children, secondary_children)`` for the
        postcodes whose LSOA appears in ``lsoa_children`` (inner join).
    """
    primary_total = (
        0.2 * pl.col("aged_0_4") + pl.col("aged_5_9") + 0.2 * pl.col("aged_10_14")
    )
    secondary_total = 0.8 * pl.col("aged_10_14") + 0.2 * pl.col("aged_15_19")
    lsoa_totals = lsoa_children.select(
        "lsoa21",
        primary_total.alias("_lsoa_primary"),
        secondary_total.alias("_lsoa_secondary"),
    )

    matched = postcodes.join(
        lsoa_totals, left_on="lsoa21cd", right_on="lsoa21", how="inner"
    )
    # Number of matched postcodes sharing each LSOA = the even-split divisor.
    counted = matched.with_columns(
        pl.len().over("lsoa21cd").alias("_lsoa_postcodes")
    )
    postcodes_in_lsoa = pl.col("_lsoa_postcodes")
    return counted.select(
        "postcode",
        "lat",
        "lng",
        (pl.col("_lsoa_primary") / postcodes_in_lsoa).alias("primary_children"),
        (pl.col("_lsoa_secondary") / postcodes_in_lsoa).alias("secondary_children"),
    )
def equilibrium_cutoffs(
    school_xy: np.ndarray,
    fill_target: np.ndarray,
    bonus_km: np.ndarray,
    pc_xy: np.ndarray,
    pc_children: np.ndarray,
    k: int = NEAREST_SCHOOL_CANDIDATES,
    max_iter: int = EQUILIBRIUM_MAX_ITER,
    tau_km: float = CHOICE_TEMPERATURE_KM,
) -> np.ndarray:
    """Market-clearing admission cutoff distance (km) per school.

    Deferred acceptance with distance priority, solved as cutoff dynamics
    (Azevedo & Leshno): cutoffs start unbounded; each round every child unit
    applies to its preferred feasible school(s) — a logit split over
    effective distance (distance - school bonus) among schools whose cutoff
    covers it, collapsing to the single best school when ``tau_km`` is 0 —
    and each oversubscribed school tightens its cutoff to its marginal
    admitted child's distance. Cutoffs only ever tighten, so the iteration
    converges.

    Args:
        school_xy: School coordinates, one row per school.
        fill_target: Places available at each school (same order).
        bonus_km: Preference bonus per school, subtracted from distance.
        pc_xy: Postcode coordinates in the same projection as ``school_xy``.
        pc_children: Children per postcode; entries <= 0 carry no demand.
        k: Number of nearest schools each postcode considers (capped at the
            number of schools).
        max_iter: Maximum number of tightening rounds.
        tau_km: Logit temperature; <= 0 selects deterministic choice.

    Returns:
        Array of cutoffs, one per school; np.inf for schools that never fill
        (no binding cutoff). With no schools or no demand at all, every
        cutoff is np.inf.
    """
    n_schools = len(school_xy)
    cutoff = np.full(n_schools, np.inf)
    demand = np.flatnonzero(pc_children > 0)
    # Nothing can bind without both schools and children; returning early also
    # avoids querying an empty tree / asking for k == 0 neighbours.
    if n_schools == 0 or len(demand) == 0:
        return cutoff

    k = min(k, n_schools)
    weights = pc_children[demand]
    tree = cKDTree(school_xy)
    dist, cand = tree.query(pc_xy[demand], k=k, workers=-1)
    if k == 1:
        # query() drops the neighbour axis for k == 1; restore it.
        dist = dist[:, None]
        cand = cand[:, None]
    eff = dist - bonus_km[cand]
    rows = np.arange(len(demand))

    for _ in range(max_iter):
        # Schools whose current cutoff excludes the postcode are infeasible.
        eff_feasible = np.where(dist <= cutoff[cand], eff, np.inf)
        if tau_km <= 0:
            choice = np.argmin(eff_feasible, axis=1)
            valid = np.isfinite(eff_feasible[rows, choice])
            chosen_school = cand[rows[valid], choice[valid]]
            chosen_dist = dist[rows[valid], choice[valid]]
            chosen_mass = weights[valid]
        else:
            z = -eff_feasible / tau_km
            z_max = z.max(axis=1, keepdims=True)
            # Subtract the row max for numerical stability; rows with no
            # feasible school have z_max == -inf, which is replaced by 0.
            share = np.exp(z - np.where(np.isfinite(z_max), z_max, 0.0))
            share[~np.isfinite(eff_feasible)] = 0.0
            total = share.sum(axis=1, keepdims=True)
            mass = weights[:, None] * share / np.where(total > 0, total, 1.0)
            # Sub-thousandth-of-a-child applications only slow the sort down.
            keep = mass > 1e-3
            chosen_school = cand[keep]
            chosen_dist = dist[keep]
            chosen_mass = mass[keep]

        # No surviving application (e.g. every logit mass fell below the
        # threshold): no school can be oversubscribed, and indexing the first
        # segment below would fail on the empty arrays.
        if chosen_school.size == 0:
            break

        # Group applications by school, nearest first within each school.
        order = np.lexsort((chosen_dist, chosen_school))
        s_sorted = chosen_school[order]
        d_sorted = chosen_dist[order]
        m_cum = np.cumsum(chosen_mass[order])
        boundaries = np.flatnonzero(np.diff(s_sorted)) + 1
        starts = np.concatenate(([0], boundaries))
        ends = np.concatenate((boundaries, [len(s_sorted)]))

        changed = False
        for start, end in zip(starts, ends):
            school = s_sorted[start]
            # Cumulative applicant mass within this school's segment.
            seg_cum = m_cum[start:end] - (m_cum[start - 1] if start else 0.0)
            if seg_cum[-1] <= fill_target[school]:
                continue
            # Distance of the applicant at which the fill target is reached.
            marginal = d_sorted[start + np.searchsorted(seg_cum, fill_target[school])]
            if marginal < cutoff[school]:
                cutoff[school] = marginal
                changed = True
        if not changed:
            break
    return cutoff
def capacity_fill_radii(
    school_xy: np.ndarray,
    fill_target: np.ndarray,
    pc_xy: np.ndarray,
    pc_children: np.ndarray,
    max_radius_km: float = MAX_RADIUS_KM,
) -> np.ndarray:
    """Feasibility radius for schools without a binding cutoff.

    An undersubscribed school admits anyone who applies, so its catchment is
    bounded by plausibility rather than competition: the distance at which
    the cumulative child population of the nearest postcodes first reaches
    its fill target.

    Only postcodes with ``pc_children > 0`` are considered, at most the 4096
    nearest per school and only those within ``max_radius_km``. A school
    whose target is not reached under those limits gets ``max_radius_km``.

    Returns:
        One radius per row of ``school_xy``.
    """
    populated = np.flatnonzero(pc_children > 0)
    child_counts = pc_children[populated]
    neighbours = cKDTree(pc_xy[populated])
    n_neighbours = min(4096, len(populated))

    result = np.full(len(school_xy), max_radius_km)
    for i, centre in enumerate(school_xy):
        dists, idx = neighbours.query(
            centre, k=n_neighbours, distance_upper_bound=max_radius_km
        )
        # Slots beyond the upper bound come back with an infinite distance.
        within = np.isfinite(dists)
        running = np.cumsum(child_counts[idx[within]])
        if len(running) == 0 or not running[-1] >= fill_target[i]:
            continue
        result[i] = dists[within][np.searchsorted(running, fill_target[i])]
    return result
def count_covering_catchments(
    pc_xy: np.ndarray,
    pc_valid: np.ndarray,
    school_xy: np.ndarray,
    school_radii: np.ndarray,
    n_postcodes: int,
) -> np.ndarray:
    """Return, for every postcode, the number of school radii that contain it.

    Args:
        pc_xy: Postcode coordinates, one row per postcode.
        pc_valid: Boolean mask; postcodes flagged False always count 0.
        school_xy: School coordinates in the same projection as ``pc_xy``.
        school_radii: Catchment radius per school (same units as the
            coordinates).
        n_postcodes: Length of the returned array.

    Returns:
        int32 array of length ``n_postcodes``.
    """
    totals = np.zeros(n_postcodes, dtype=np.int32)
    if len(school_xy) > 0:
        usable = np.flatnonzero(pc_valid)
        # One list of covered (usable-relative) postcode indices per school.
        hits = cKDTree(pc_xy[usable]).query_ball_point(
            school_xy, school_radii, workers=-1
        )
        # Each school lists a postcode at most once, so tallying all listed
        # indices gives the number of covering schools per postcode.
        covered = np.concatenate(
            [np.asarray(members, dtype=np.intp) for members in hits]
        )
        totals[usable] = np.bincount(covered, minlength=len(usable))
    return totals
def main():
    """CLI entry point for the school-catchment model.

    Steps:
      1. Read GIAS and Ofsted; label good+/outstanding school/phase rows.
      2. Build per-school phase intakes and preference bonuses (supply) and
         per-postcode phase-age children (demand).
      3. For each phase, solve admission cutoffs, fall back to a
         capacity-fill radius for schools that never fill, then apply the
         phase calibration factor and the min/max radius clamp.
      4. Optionally write per-postcode catchment counts (``--output``) and/or
         per-school radii (``--schools-output``).

    Raises:
        SystemExit: if neither ``--output`` nor ``--schools-output`` is given.
        ValueError: if no good+ primary/secondary school is found.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Model school admission cutoff radii and count good+/outstanding "
            "primary/secondary catchments covering each postcode"
        )
    )
    parser.add_argument(
        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
    )
    parser.add_argument(
        "--gias", type=Path, required=True, help="GIAS open-school parquet"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--lsoa-children",
        type=Path,
        required=True,
        help="Census 2021 children by LSOA parquet",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="Per-postcode counts parquet; omit for calibration runs that only "
        "need --schools-output",
    )
    parser.add_argument(
        "--schools-output",
        type=Path,
        default=None,
        help="Optional per-school catchment radii parquet (for calibration/debugging)",
    )
    parser.add_argument(
        "--bonus-outstanding-km",
        type=float,
        default=PREF_BONUS_OUTSTANDING_KM,
        help="Preference bonus for Outstanding schools (calibration sweeps)",
    )
    parser.add_argument(
        "--bonus-good-km",
        type=float,
        default=PREF_BONUS_GOOD_KM,
        help="Preference bonus for Good schools (calibration sweeps)",
    )
    parser.add_argument(
        "--demand-scale",
        type=float,
        default=DEMAND_SCALE,
        help="Share of resident children competing for state places",
    )
    parser.add_argument(
        "--choice-temperature-km",
        type=float,
        default=CHOICE_TEMPERATURE_KM,
        help="Logit choice temperature over effective distance",
    )
    args = parser.parse_args()

    # Fail fast: without any output path the whole solve below would be
    # discarded, so reject the invocation before reading any data.
    if args.output is None and args.schools_output is None:
        raise SystemExit("Provide --output and/or --schools-output")

    gias = pl.read_parquet(args.gias)
    open_urns = set(
        gias.select(pl.col("urn").cast(pl.Int64, strict=False))
        .to_series()
        .drop_nulls()
        .to_list()
    )
    print(f"GIAS open register: {len(open_urns):,} open school URNs")

    ofsted = pl.read_parquet(args.ofsted)
    rated = classify_good_plus_schools(ofsted, open_urns=open_urns)
    if rated.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")
    print(f"Good+ school/phase rows: {len(rated):,}")

    # Supply: every modelled school with its phase intakes; schools without an
    # Ofsted row get a neutral (0.0) preference bonus.
    supply = phase_intakes(gias).join(
        school_preference_bonuses(
            ofsted,
            bonus_outstanding_km=args.bonus_outstanding_km,
            bonus_good_km=args.bonus_good_km,
        ),
        on="urn",
        how="left",
    ).with_columns(pl.col("bonus_km").fill_null(0.0))
    print(f"State schools in admissions model: {len(supply):,}")

    arcgis = pl.read_parquet(args.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
        "lsoa21cd",
        "doterm",
    )
    # Demand comes only from postcodes with no termination date whose LSOA
    # code starts with "E".
    live = arcgis.filter(
        pl.col("doterm").is_null() & pl.col("lsoa21cd").str.starts_with("E")
    )
    demand = children_per_postcode(live, pl.read_parquet(args.lsoa_children))
    print(
        f"Demand postcodes: {len(demand):,} "
        f"({demand['primary_children'].sum():,.0f} primary-age, "
        f"{demand['secondary_children'].sum():,.0f} secondary-age children)"
    )

    # Shared local-km projection so assignment and coverage use one metric.
    pc_lats = arcgis["lat"].to_numpy()
    pc_lngs = arcgis["lng"].to_numpy()
    pc_valid = valid_uk_coords_mask(pc_lats, pc_lngs)
    origin_lat = float(np.mean(pc_lats[pc_valid]))
    pc_xy = _project_lat_lng_km(pc_lats, pc_lngs, origin_lat)
    demand_lats = demand["lat"].to_numpy()
    demand_lngs = demand["lng"].to_numpy()
    demand_valid = valid_uk_coords_mask(demand_lats, demand_lngs)
    demand_xy = _project_lat_lng_km(demand_lats, demand_lngs, origin_lat)
    school_xy = _project_lat_lng_km(
        supply["lat"].to_numpy(), supply["lng"].to_numpy(), origin_lat
    )

    radii = {}
    for phase in ("primary", "secondary"):
        in_phase = supply[f"{phase}_intake"].to_numpy() > 0
        targets = supply[f"{phase}_intake"].to_numpy()[in_phase]
        xy = school_xy[in_phase]
        # Postcodes with invalid coordinates contribute no demand.
        children = np.where(
            demand_valid,
            demand[f"{phase}_children"].to_numpy() * args.demand_scale,
            0.0,
        )
        print(f"Solving {phase} admissions for {in_phase.sum():,} schools...")
        cutoffs = equilibrium_cutoffs(
            xy,
            targets,
            supply["bonus_km"].to_numpy()[in_phase],
            demand_xy,
            children,
            tau_km=args.choice_temperature_km,
        )
        filled = np.isfinite(cutoffs)
        print(
            f" {filled.sum():,} schools have binding cutoffs "
            f"(median {np.median(cutoffs[filled]):.2f} km); "
            f"{(~filled).sum():,} undersubscribed"
        )
        # Schools that never fill get a capacity-fill radius instead.
        fallback = capacity_fill_radii(
            xy[~filled], targets[~filled], demand_xy, children
        )
        raw = cutoffs.copy()
        raw[~filled] = fallback
        radii[phase] = pl.DataFrame(
            {
                "urn": supply["urn"].to_numpy()[in_phase],
                "phase": phase,
                "cutoff_km": raw,
                "filled": filled,
                "radius_km": np.clip(
                    raw * CUTOFF_CALIBRATION_FACTOR[phase],
                    MIN_RADIUS_KM,
                    MAX_RADIUS_KM,
                ),
            }
        )
        print(
            f" radius km: median {radii[phase]['radius_km'].median():.2f}, "
            f"p90 {radii[phase]['radius_km'].quantile(0.9):.2f}"
        )

    # Attach each rated school's phase radius; rated schools outside the
    # admissions model (special schools, selective schools, missing
    # headcounts) cannot be given a defensible radius and are dropped.
    rated = rated.with_columns(
        pl.col("category").str.split("_").list.get(1).alias("phase")
    )
    rated_with_radius = rated.join(
        pl.concat(list(radii.values())), on=["urn", "phase"], how="inner"
    ).join(supply.select("urn", "lat", "lng"), on="urn", how="inner")
    dropped = len(rated) - len(rated_with_radius)
    print(
        f"Rated school/phase rows with radii: {len(rated_with_radius):,} "
        f"(dropped {dropped:,}, incl. selective schools)"
    )

    if args.output is not None:
        category_counts = {}
        for category in set(c for cats in SCHOOL_GROUPS.values() for c in cats):
            cat = rated_with_radius.filter(pl.col("category") == category)
            cat_xy = _project_lat_lng_km(
                cat["lat"].to_numpy(), cat["lng"].to_numpy(), origin_lat
            )
            category_counts[category] = count_covering_catchments(
                pc_xy, pc_valid, cat_xy, cat["radius_km"].to_numpy(), len(arcgis)
            )
            print(f" {category}: {len(cat):,} schools")
        # Each output group is the sum of the per-category counts it contains.
        result = pl.DataFrame(
            {
                "postcode": arcgis["postcode"],
                **{
                    f"{group}_catchments": sum(category_counts[c] for c in categories)
                    for group, categories in SCHOOL_GROUPS.items()
                },
            }
        )
        for group in SCHOOL_GROUPS:
            col = result[f"{group}_catchments"]
            print(f" {group}_catchments: mean {col.mean():.2f}, max {col.max()}")
        args.output.parent.mkdir(parents=True, exist_ok=True)
        result.write_parquet(args.output)
        size_mb = args.output.stat().st_size / (1024 * 1024)
        print(f"Wrote {args.output} ({size_mb:.1f} MB)")

    if args.schools_output is not None:
        schools_out = rated_with_radius.select(
            "urn", "category", "phase", "cutoff_km", "filled", "radius_km", "lat", "lng"
        )
        args.schools_output.parent.mkdir(parents=True, exist_ok=True)
        schools_out.write_parquet(args.schools_output)
        print(f"Wrote {args.schools_output}")


if __name__ == "__main__":
    main()

View file

@ -1,199 +0,0 @@
"""Compute Ofsted-rated school proximity counts per postcode."""
import argparse
from pathlib import Path
import polars as pl
from pipeline.utils.poi_counts import count_pois_per_postcode
# Output metric group -> school ``category`` labels counted toward it. The
# "good_*" groups count outstanding schools as well as good ones; the
# "outstanding_*" groups count outstanding schools only. Passed as ``groups=``
# to count_pois_per_postcode.
SCHOOL_GROUPS = {
"good_primary": ["good_primary", "outstanding_primary"],
"good_secondary": ["good_secondary", "outstanding_secondary"],
"outstanding_primary": ["outstanding_primary"],
"outstanding_secondary": ["outstanding_secondary"],
}
# Age thresholds for deciding which phase(s) a school serves. A school serves
# PRIMARY-age children if its statutory lowest age is <= 10, and SECONDARY-age
# children if its statutory highest age is >= 12. All-through (e.g. 3-18) and
# middle-deemed-secondary (e.g. 9-13) schools satisfy BOTH and so are counted in
# both the primary and the secondary proximity metrics — Ofsted's coarse "Ofsted
# phase" labels such schools as just "Secondary", which previously hid them from
# every postcode's primary-school count.
PRIMARY_MAX_AGE = 10
SECONDARY_MIN_AGE = 12
def classify_good_plus_schools(
    ofsted: pl.DataFrame, open_urns: set[int] | None = None
) -> pl.DataFrame:
    """Build the ``(postcode, category)`` rows used for proximity counts.

    Only rows whose "Ofsted phase" is Primary or Secondary are considered. A
    grade of "1" (outstanding) or "2" (good) is taken from "Latest OEIF
    overall effectiveness" when it holds one of those values. Otherwise, and
    ONLY when that column is null or "Not judged", an ungraded outcome
    starting with "School remains Outstanding" / "School remains Good" maps
    to "1" / "2", so a genuine graded 3/4 is never overridden. Ungraded
    outcomes containing "(Concerns)" are never treated as good+. Rows left
    without a grade are dropped.

    Phase assignment uses the statutory age range when both age columns are
    present (so all-through and middle schools count toward BOTH phases),
    falling back to the coarse "Ofsted phase" label when an age is missing
    or the columns are absent.

    Args:
        ofsted: Ofsted inspection frame.
        open_urns: Optional GIAS open-register URNs; when given (and the
            frame has a "URN" column) schools outside it are dropped.

    Returns:
        Frame with ``postcode`` and ``category``; a school can contribute up
        to two rows (primary and secondary).
    """
    # Cast to Utf8 so the string predicates are well-defined even for a
    # column that was read back as entirely null (Null dtype).
    graded_result = pl.col("Latest OEIF overall effectiveness").cast(
        pl.Utf8, strict=False
    )
    ungraded_outcome = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )
    fallback_allowed = graded_result.is_null() | (graded_result == "Not judged")
    without_concerns = ~ungraded_outcome.str.contains(r"\(Concerns\)")
    still_outstanding = (
        ungraded_outcome.str.starts_with("School remains Outstanding")
        & without_concerns
    )
    still_good = (
        ungraded_outcome.str.starts_with("School remains Good") & without_concerns
    )
    derived_grade = (
        pl.when(graded_result.is_in(["1", "2"]))
        .then(graded_result)
        .when(fallback_allowed & still_outstanding)
        .then(pl.lit("1"))
        .when(fallback_allowed & still_good)
        .then(pl.lit("2"))
        .otherwise(None)
    )

    rated = (
        ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
        .with_columns(derived_grade.alias("_ofsted_grade"))
        .filter(pl.col("_ofsted_grade").is_not_null())
    )

    # Stale "latest inspection" rows for closed/merged schools are removed
    # when the open register is available.
    if open_urns is not None and "URN" in rated.columns:
        rated = rated.filter(pl.col("URN").is_in(list(open_urns)))

    # Start from the coarse label and refine with statutory ages if present.
    phase_label = pl.col("Ofsted phase")
    is_primary = phase_label == "Primary"
    is_secondary = phase_label == "Secondary"
    age_columns = {"Statutory lowest age", "Statutory highest age"}
    if age_columns.issubset(rated.columns):
        lowest = pl.col("Statutory lowest age").cast(pl.Int64, strict=False)
        highest = pl.col("Statutory highest age").cast(pl.Int64, strict=False)
        is_primary = (
            pl.when(lowest.is_not_null())
            .then(lowest <= PRIMARY_MAX_AGE)
            .otherwise(is_primary)
        )
        is_secondary = (
            pl.when(highest.is_not_null())
            .then(highest >= SECONDARY_MIN_AGE)
            .otherwise(is_secondary)
        )
    rated = rated.with_columns(
        is_primary.alias("_serves_primary"),
        is_secondary.alias("_serves_secondary"),
    )

    # One frame per phase; grade "1" is labelled outstanding, the remaining
    # grade ("2") is labelled good.
    per_phase = [
        rated.filter(pl.col(flag)).with_columns(
            pl.when(pl.col("_ofsted_grade") == "1")
            .then(pl.lit(outstanding_label))
            .otherwise(pl.lit(good_label))
            .alias("category")
        )
        for flag, outstanding_label, good_label in (
            ("_serves_primary", "outstanding_primary", "good_primary"),
            ("_serves_secondary", "outstanding_secondary", "good_secondary"),
        )
    ]
    return pl.concat(per_phase).select(
        pl.col("Postcode").alias("postcode"),
        "category",
    )
def main():
    """CLI entry point: count good+/outstanding schools near each postcode.

    Reads the Ofsted parquet (optionally restricted to URNs in the GIAS open
    register), looks up each school's coordinates through its postcode in the
    ArcGIS parquet, counts schools per ``SCHOOL_GROUPS`` group within 5 km and
    2 km of every postcode, and writes the joined counts to ``--output``.

    Raises:
        ValueError: if no good+ school is found, or none of them matches an
            ArcGIS postcode.
    """
    parser = argparse.ArgumentParser(
        description="Count good+ and outstanding primary/secondary schools near each postcode"
    )
    parser.add_argument(
        "--ofsted", type=Path, required=True, help="Ofsted inspection parquet"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode parquet"
    )
    parser.add_argument(
        "--gias",
        type=Path,
        default=None,
        help="GIAS open-school parquet; if given, only currently-open schools are counted",
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet path"
    )
    args = parser.parse_args()

    open_urns: set[int] | None = None
    if args.gias is not None:
        # Cast BEFORE dropping nulls: the non-strict cast turns unparseable
        # URNs into nulls, which would otherwise end up as None in the set
        # and inflate the count printed below.
        gias_urns = (
            pl.read_parquet(args.gias)
            .select(pl.col("urn").cast(pl.Int64, strict=False))
            .to_series()
            .drop_nulls()
        )
        open_urns = set(gias_urns.to_list())
        print(f"GIAS open register: {len(open_urns):,} open school URNs")

    ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted), open_urns=open_urns)
    if ofsted.is_empty():
        raise ValueError("No good+ primary/secondary Ofsted schools found")
    print(f"Good+ schools: {len(ofsted):,}")
    print(
        "Outstanding schools: "
        f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
    )

    # Join with arcgis to get lat/lng for each school's postcode
    arcgis = pl.read_parquet(args.arcgis).select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lng"),
    )
    schools = ofsted.join(arcgis, on="postcode", how="inner")
    if schools.is_empty():
        raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates")
    print(f"Schools with coordinates: {len(schools):,}")

    # Load all postcodes for proximity counting
    postcodes = arcgis.rename({"lng": "lon"})
    counts_5km = count_pois_per_postcode(
        postcodes, schools, radius_km=5, groups=SCHOOL_GROUPS
    )
    counts_2km = count_pois_per_postcode(
        postcodes, schools, radius_km=2, groups=SCHOOL_GROUPS
    )
    result = counts_5km.join(counts_2km, on="postcode")

    args.output.parent.mkdir(parents=True, exist_ok=True)
    result.write_parquet(args.output)
    size_mb = args.output.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()

View file

@ -8,6 +8,7 @@ import polars as pl
from pipeline.transform.join_epc_pp import (
EPC_SOURCE_COLUMNS,
_join_address_parts,
_run,
_scan_epc_certificates,
)
@ -111,6 +112,89 @@ def test_scan_epc_certificates_supports_domestic_zip(tmp_path: Path):
assert df.schema["number_habitable_rooms"] == pl.Int16
def test_join_address_parts_empty_string_components():
    # Price-paid SAON/PAON/STREET are empty strings (not null) when absent;
    # concat_str(ignore_nulls=True) alone leaked the separator into the
    # display address (' 10 PALACE GREEN') and doubled it for empty middle
    # components. Empty/whitespace-only parts must contribute nothing.
    df = pl.DataFrame(
        {
            "saon": ["", "FLAT 1", "FLAT 1", "FLAT 21", "", None, " ", " FLAT 2"],
            "paon": ["10", "10", "", "82", "", None, "10", "11 "],
            "street": [
                "PALACE GREEN",
                "HIGH STREET",
                "HIGH STREET",
                "",
                "",
                None,
                "PALACE GREEN",
                "STATION ROAD",
            ],
        }
    )
    out = df.select(
        _join_address_parts("saon", "paon", "street").alias("address")
    ).get_column("address")
    assert out.to_list() == [
        "10 PALACE GREEN",  # empty saon -> no leading space
        "FLAT 1 10 HIGH STREET",  # normal three-part address is unchanged
        "FLAT 1 HIGH STREET",  # empty middle component -> no double space
        "FLAT 21 82",  # empty street -> no trailing space
        None,  # all-empty -> null, not whitespace junk
        None,  # all-null -> null
        "10 PALACE GREEN",  # whitespace-only component treated as empty
        "FLAT 2 11 STATION ROAD",  # per-component padding is stripped
    ]
    # Invariant: every produced address is trimmed and single-spaced.
    produced = out.drop_nulls()
    assert produced.str.starts_with(" ").sum() == 0
    assert produced.str.ends_with(" ").sum() == 0
    # "Single-spaced" means no run of TWO spaces; searching for a single
    # space would match every multi-part address and could never pass.
    assert produced.str.contains("  ", literal=True).sum() == 0
def test_run_builds_clean_pp_address_from_empty_string_saon(tmp_path: Path):
    # Real price-paid rows carry saon == "" (not null) on ~88% of rows; the
    # published pp_address must not inherit a leading separator from it.

    # EPC side: a zip holding one certificates CSV with a single row.
    certificates_csv = io.StringIO()
    epc_writer = csv.DictWriter(certificates_csv, fieldnames=EPC_SOURCE_COLUMNS)
    epc_writer.writeheader()
    epc_writer.writerow(_row())
    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
        bundle.writestr("certificates-2024.csv", certificates_csv.getvalue())

    # Price-paid side: one sale whose saon and locality are empty strings.
    sale = {
        "price": 250_000,
        "date_of_transfer": date(2024, 2, 3),
        "property_type": "T",
        "postcode": "AA1 1AA",
        "paon": "1",
        "saon": "",
        "street": "Example Street",
        "locality": "",
        "town_city": "Exampletown",
        "duration": "F",
        "old_new": "N",
        "ppd_category": "A",
    }
    pp_path = tmp_path / "price-paid.parquet"
    pl.DataFrame({column: [value] for column, value in sale.items()}).write_parquet(
        pp_path
    )

    joined_path = tmp_path / "epc-pp.parquet"
    _run(epc_zip, pp_path, joined_path, tmp_path)

    joined = pl.read_parquet(joined_path)
    assert joined.height == 1
    # No leading space, and the clean address still matches its EPC record.
    expected_addresses = [
        {"pp_address": "1 Example Street", "epc_address": "1 Example Street"}
    ]
    assert joined.select("pp_address", "epc_address").to_dicts() == expected_addresses
def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
zip_path = tmp_path / "domestic-csv.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:

View file

@ -304,7 +304,7 @@ def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
poi_counts=_by_postcode({}),
noise=_by_postcode({}),
school_proximity=_by_postcode({}),
school_catchments=_by_postcode({}),
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
tree_density=None,
broadband=pl.LazyFrame(
@ -362,7 +362,7 @@ def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
poi_counts=_by_postcode({}),
noise=_by_postcode({}),
school_proximity=_by_postcode({}),
school_catchments=_by_postcode({}),
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
tree_density=None,
broadband=broadband,
@ -1057,7 +1057,7 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
poi_counts=_by_postcode({}),
noise=_by_postcode({}),
school_proximity=_by_postcode({}),
school_catchments=_by_postcode({}),
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
tree_density=None,
broadband=pl.LazyFrame(

View file

@ -1,9 +1,11 @@
import polars as pl
from pipeline.transform.poi_proximity import (
GREENSPACE_PARK_FUNCTIONS,
POI_GROUPS_2KM,
_build_poi_category_groups,
_dynamic_poi_metric_renames,
_greenspace_count_frame,
_groceries_categories,
)
from pipeline.utils.poi_counts import count_pois_per_postcode
@ -88,3 +90,84 @@ def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
"parks_2km": "Number of amenities (Park) within 2km",
"parks_5km": "Number of amenities (Park) within 5km",
}
def test_groceries_categories_exclude_speciality_food_retail() -> None:
    """Speciality food retail (bakeries, butchers, delis, off-licences) must
    stay out of the static groceries metric, while Supermarket, Convenience
    Store, Greengrocer and a GEOLYTIX brand (Tesco) are kept."""
    grocery_categories = [
        "Tesco",
        "Supermarket",
        "Convenience Store",
        "Greengrocer",
        "Bakery",
        "Butcher & Fishmonger",
        "Deli & Specialty",
        "Off-Licence",
    ]
    categories = grocery_categories + ["Café"]
    groups = ["Groceries"] * 8 + ["Leisure"]
    pois = pl.DataFrame(
        {
            "category": categories,
            "group": groups,
            "lat": [51.5] * 9,
            "lng": [-0.1] * 9,
        }
    )

    kept = _groceries_categories(pois)

    assert kept == [
        "Convenience Store",
        "Greengrocer",
        "Supermarket",
        "Tesco",
    ]
def test_park_group_excludes_playgrounds_and_play_space() -> None:
    """Only open recreation grounds count as a Park.

    "Public Park Or Garden" and "Playing Field" are in scope; "Play Space"
    (playgrounds) must not appear in the park function list.
    """
    expected_functions = {"parks": ["Public Park Or Garden", "Playing Field"]}
    assert GREENSPACE_PARK_FUNCTIONS == expected_functions
def test_greenspace_count_frame_collapses_to_one_row_per_site() -> None:
    """Access points are collapsed so each greenspace site is counted once.

    Fixture: three gates of one park that has a site centroid ("site-a"), one
    gate of another park without a centroid ("site-b"), and one
    centroid-fallback row whose site_id is null.
    """
    greenspace = pl.DataFrame(
        {
            "lat": [51.50, 51.51, 51.52, 53.0, 54.0],
            "lng": [-0.10, -0.11, -0.12, -2.0, -3.0],
            "category": [
                "Public Park Or Garden",
                "Public Park Or Garden",
                "Public Park Or Garden",
                "Playing Field",
                "Public Park Or Garden",
            ],
            "site_id": ["site-a", "site-a", "site-a", "site-b", None],
            "site_lat": [51.505, 51.505, 51.505, None, None],
            "site_lng": [-0.105, -0.105, -0.105, None, None],
        }
    )

    collapsed = _greenspace_count_frame(greenspace).sort("lat")

    # site-a's three gates become one row; site-b and the null-site row stay.
    assert collapsed.height == 3

    # With a centroid available, the site is represented by that centroid.
    centroid_site = collapsed.filter(pl.col("site_id") == "site-a")
    assert centroid_site["lat"].to_list() == [51.505]
    assert centroid_site["lng"].to_list() == [-0.105]

    # Without a centroid, the first access point stands in for the site.
    gate_only_site = collapsed.filter(pl.col("site_id") == "site-b")
    assert gate_only_site["lat"].to_list() == [53.0]
def test_greenspace_count_frame_passes_legacy_parquet_through() -> None:
    """A frame without site columns is returned unchanged.

    The shipped parquet predates the site_id column; counting must not crash
    and keeps the old access-point grain until the data is regenerated.
    """
    columns = {
        "lat": [51.50, 51.51],
        "lng": [-0.10, -0.11],
        "category": ["Public Park Or Garden", "Play Space"],
    }
    legacy = pl.DataFrame(columns)

    passed_through = _greenspace_count_frame(legacy)

    assert passed_through.equals(legacy)

View file

@ -0,0 +1,354 @@
import numpy as np
import polars as pl
from pipeline.transform.school_catchments import (
capacity_fill_radii,
children_per_postcode,
classify_good_plus_schools,
count_covering_catchments,
equilibrium_cutoffs,
phase_intakes,
school_preference_bonuses,
)
def _school(phase, oeif, ungraded, urn=100000):
return {
"URN": urn,
"Postcode": "AA1 1AA",
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": ungraded,
}
def _classify(rows):
    """Run the good+ classifier over ``rows`` and return its (urn, category) pairs as a set."""
    classified = classify_good_plus_schools(pl.DataFrame(rows))
    pairs = set()
    for record in classified.to_dicts():
        pairs.add((record["urn"], record["category"]))
    return pairs
def test_legacy_oeif_grades_1_and_2_are_kept():
    """Graded OEIF results 1 and 2 map to outstanding/good for each phase."""
    schools = [
        _school("Primary", "1", None, urn=1),
        _school("Primary", "2", None, urn=2),
        _school("Secondary", "1", None, urn=3),
        _school("Secondary", "2", None, urn=4),
    ]
    expected = {
        (1, "outstanding_primary"),
        (2, "good_primary"),
        (3, "outstanding_secondary"),
        (4, "good_secondary"),
    }
    assert _classify(schools) == expected
def test_grades_3_and_4_are_excluded():
    """Schools graded 3 or 4 never appear in the good+ classification."""
    below_good = [_school("Primary", grade, None) for grade in ("3", "4")]
    assert _classify(below_good) == set()
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    """A null or "Not judged" OEIF grade falls back to the ungraded outcome."""
    no_grade = _school("Primary", None, "School remains Good", urn=1)
    not_judged = _school(
        "Secondary", "Not judged", "School remains Outstanding", urn=2
    )
    # An "(Improving)" suffix still counts as good+.
    improving = _school(
        "Primary", None, "School remains Good (Improving) - S5 Next", urn=3
    )

    recovered = _classify([no_grade, not_judged, improving])

    assert recovered == {
        (1, "good_primary"),
        (2, "outstanding_secondary"),
        (3, "good_primary"),
    }
def test_ungraded_concerns_are_not_good_plus():
    """Ungraded "(Concerns)" outcomes are excluded from good+ schools.

    They signal issues warranting earlier re-inspection, so neither the Good
    nor the Outstanding variant may be counted.
    """
    good_with_concerns = _school(
        "Primary", None, "School remains Good (Concerns) - S5 Next", urn=1
    )
    outstanding_with_concerns = _school(
        "Secondary",
        None,
        "School remains Outstanding (Concerns) - S5 Next",
        urn=2,
    )
    classified = _classify([good_with_concerns, outstanding_with_concerns])
    assert classified == set()
def test_ungraded_non_good_outcomes_are_excluded():
    """Ungraded outcomes other than "remains Good/Outstanding" yield nothing."""
    outcomes = ("Some aspects not as strong", "Standards maintained", None)
    schools = [_school("Primary", None, outcome) for outcome in outcomes]
    assert _classify(schools) == set()
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
    """A real grade 3 wins over an ungraded "remains Good" outcome."""
    graded_three = _school("Primary", "3", "School remains Good")
    assert _classify([graded_three]) == set()
def test_non_primary_secondary_phases_excluded():
    """Phases other than Primary/Secondary are ignored even when graded 1 or 2."""
    phase_and_grade = [("Nursery", "1"), ("Not applicable", "2")]
    schools = [_school(phase, grade, None) for phase, grade in phase_and_grade]
    assert _classify(schools) == set()
def _aged_school(phase, oeif, low, high, urn=100000):
return {
"URN": urn,
"Postcode": "AA1 1AA",
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": None,
"Statutory lowest age": low,
"Statutory highest age": high,
}
def test_all_through_school_counts_toward_both_primary_and_secondary():
    """An all-through school (age 3-18) is counted in both phase metrics.

    Ofsted labels it "Secondary", but it serves primary-age children too.
    """
    all_through = _aged_school("Secondary", "2", 3, 18, urn=1)
    expected = {(1, "good_primary"), (1, "good_secondary")}
    assert _classify([all_through]) == expected
def test_age_ranges_assign_single_phase_for_standard_schools():
    """Age ranges decide the phase: standard schools get one, middle schools both."""
    primary_only = _aged_school("Primary", "1", 4, 11, urn=1)
    secondary_only = _aged_school("Secondary", "2", 11, 16, urn=2)
    middle = _aged_school("Secondary", "1", 9, 13, urn=3)  # spans both phases

    classified = _classify([primary_only, secondary_only, middle])

    assert classified == {
        (1, "outstanding_primary"),
        (2, "good_secondary"),
        (3, "outstanding_primary"),
        (3, "outstanding_secondary"),
    }
def test_closed_schools_excluded_when_open_register_given():
    """Schools whose URN is missing from ``open_urns`` are dropped."""
    still_open = _aged_school("Primary", "1", 4, 11, urn=111)
    closed = _aged_school("Secondary", "2", 11, 16, urn=222)
    frame = pl.DataFrame([still_open, closed])

    classified = classify_good_plus_schools(frame, open_urns={111})

    pairs = set()
    for record in classified.to_dicts():
        pairs.add((record["urn"], record["category"]))
    # URN 222 is absent from the open register, so only URN 111 survives.
    assert pairs == {(111, "outstanding_primary")}
def _gias_row(
urn,
type_group="Academies",
age_range="411",
pupils=210,
capacity=None,
admissions_policy=None,
):
return {
"urn": urn,
"name": f"School {urn}",
"lat": 51.5,
"lng": -0.1,
"type_group": type_group,
"age_range": age_range,
"pupils": pupils,
"capacity": capacity,
"admissions_policy": admissions_policy,
}
def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
    """The fill target is split between phases in proportion to cohort weights."""
    schools = pl.DataFrame(
        [
            # 4-11 = cohorts 4..10, all 7 primary: full fill target.
            _gias_row(1, age_range="411", pupils=210),
            # 11-16 = cohorts 11..15, all 5 secondary.
            _gias_row(2, age_range="1116", pupils=500),
            # 3-11 = cohorts 3..10; the nursery year weighs 0.5, so primary
            # gets 7 of 7.5 cohort weights.
            _gias_row(3, age_range="311", pupils=240),
            # All-through 4-16 = cohorts 4..15: 7/12 primary, 5/12 secondary.
            _gias_row(4, age_range="416", pupils=1200),
            # 11-18 = cohorts 11..17; sixth-form years weigh 0.6 each, so
            # secondary gets 5 of 6.2 cohort weights.
            _gias_row(5, age_range="1118", pupils=1240),
        ]
    )

    intakes = phase_intakes(schools).sort("urn")

    expected_primary = [210.0, 0.0, 224.0, 700.0, 0.0]
    expected_secondary = [0.0, 500.0, 0.0, 500.0, 1000.0]
    assert intakes["primary_intake"].to_list() == expected_primary
    assert intakes["secondary_intake"].to_list() == expected_secondary
def test_phase_intakes_excludes_non_state_and_selective_schools():
    """Only schools with a usable headcount and distance-based intake survive."""
    excluded = [
        _gias_row(1, type_group="Independent schools"),
        _gias_row(2, type_group="Special schools"),
        _gias_row(3, type_group="Welsh schools"),
        # Grammar school intakes are test-based and region-wide; a distance
        # catchment would be fabricated.
        _gias_row(4, admissions_policy="Selective"),
        _gias_row(6, pupils=None, capacity=None),  # no usable headcount
        _gias_row(7, age_range=None),  # no parsable cohorts
    ]
    kept = [
        # Missing pupil count: expected intake below equals the capacity.
        _gias_row(5, pupils=None, capacity=300),
        # Over-full school keeps its demonstrated size.
        _gias_row(8, pupils=350, capacity=300),
        _gias_row(9, admissions_policy="Non-selective"),
    ]
    # Feed the rows in URN order, exactly as the flat fixture listed them.
    ordered_rows = sorted(excluded + kept, key=lambda row: row["urn"])

    intakes = phase_intakes(pl.DataFrame(ordered_rows)).sort("urn")

    assert intakes["urn"].to_list() == [5, 8, 9]
    assert intakes["primary_intake"].to_list() == [300.0, 350.0, 210.0]
def test_school_preference_bonuses_follow_derived_grade():
    """Each school's preference bonus follows its derived grade.

    With a 1.0km outstanding bonus and a 0.5km good bonus, grades 3 and 4 get
    the mirrored penalties, an unrated school gets 0.0, and a "Not judged"
    school falls back to its ungraded "remains Good" outcome.
    """
    schools = [
        _school("Primary", "1", None, 1),
        _school("Primary", "2", None, 2),
        _school("Primary", "3", None, 3),
        _school("Primary", "4", None, 4),
        _school("Primary", None, "Some aspects not as strong", 5),  # unrated
        _school("Primary", "Not judged", "School remains Good", 6),
    ]

    bonus_frame = school_preference_bonuses(
        pl.DataFrame(schools), bonus_outstanding_km=1.0, bonus_good_km=0.5
    )
    bonuses = dict(bonus_frame.iter_rows())

    assert bonuses == {1: 1.0, 2: 0.5, 3: -0.5, 4: -1.0, 5: 0.0, 6: 0.5}
def test_children_per_postcode_prorates_bands_and_splits_lsoa_evenly():
    """Census age bands are prorated into phases, then shared across an LSOA's postcodes."""
    postcodes = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AB", "BB2 2BB"],
            "lat": [51.5, 51.5, 52.0],
            "lng": [-0.1, -0.1, -0.2],
            "lsoa21cd": ["E01000001", "E01000001", "E01000002"],
        }
    )
    lsoa_children = pl.DataFrame(
        {
            "lsoa21": ["E01000001", "E01000002"],
            "aged_0_4": [100, 30],
            "aged_5_9": [100, 10],
            "aged_10_14": [100, 20],
            "aged_15_19": [100, 40],
        }
    )

    per_postcode = children_per_postcode(postcodes, lsoa_children).sort("postcode")

    # Primary 4-10 = 0.2*aged_0_4 + aged_5_9 + 0.2*aged_10_14: 140 shared by
    # the first LSOA's 2 postcodes; 20 for the single-postcode LSOA.
    primary = per_postcode["primary_children"].to_list()
    assert primary == [70.0, 70.0, 20.0]

    # Secondary 11-15 = 0.8*aged_10_14 + 0.2*aged_15_19: 100 shared by 2; 24.
    secondary = per_postcode["secondary_children"].to_list()
    assert secondary == [50.0, 50.0, 24.0]
def test_equilibrium_cutoff_tightens_to_marginal_admitted_distance():
    """A full school's cutoff is the distance of its marginal admitted child.

    One school with 10 places; postcodes at 1km, 2km and 3km hold 5 children
    each. The two nearest postcodes exactly fill the school, so the cutoff
    lands at 2km and the 3km postcode is shut out.
    """
    school_xy = np.array([[0.0, 0.0]])
    places = np.array([10.0])
    bonuses = np.array([0.0])
    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
    children = np.array([5.0, 5.0, 5.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
    )

    assert cutoffs.tolist() == [2.0]
def test_equilibrium_rejected_demand_cascades_to_next_school():
    """Children rejected by a full school fall through to the next one.

    School A (5 places) sits at the origin and school B (5 places) at 10km.
    P1 (1km, 5 children) and P2 (1.5km, 5 children) both prefer A; A fills
    with P1 and tightens its cutoff to 1km, pushing P2 out to B. B never
    exceeds its target, so it keeps no binding cutoff.
    """
    school_xy = np.array([[0.0, 0.0], [10.0, 0.0]])
    places = np.array([5.0, 5.0])
    bonuses = np.array([0.0, 0.0])
    postcode_xy = np.array([[1.0, 0.0], [1.5, 0.0]])
    children = np.array([5.0, 5.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
    )

    cutoff_a, cutoff_b = cutoffs[0], cutoffs[1]
    assert cutoff_a == 1.0
    assert np.isinf(cutoff_b)
def test_equilibrium_preference_bonus_steers_demand_to_better_school():
    """A preference bonus breaks a distance tie in favour of the better school.

    Both schools are equidistant from the only postcode; school A carries a
    0.5km bonus, so all children choose it and B attracts nobody.
    """
    school_xy = np.array([[0.0, 0.0], [2.0, 0.0]])
    places = np.array([5.0, 5.0])
    bonuses = np.array([0.5, 0.0])
    postcode_xy = np.array([[1.0, 0.0]])
    children = np.array([10.0])

    cutoffs = equilibrium_cutoffs(
        school_xy, places, bonuses, postcode_xy, children, tau_km=0.0
    )

    cutoff_a, cutoff_b = cutoffs[0], cutoffs[1]
    assert cutoff_a == 1.0
    assert np.isinf(cutoff_b)
def test_equilibrium_logit_choice_smears_demand_across_schools():
    """A positive logit temperature spreads demand over equally attractive schools.

    The two schools sit at x=0 and x=2 with equal bonuses and the only
    postcode is at x=1, so both are 1km away (there is no "near" or "far"
    school in this fixture). With ``tau_km=1.0`` each school receives half of
    the 10 children.
    """
    cutoffs = equilibrium_cutoffs(
        np.array([[0.0, 0.0], [2.0, 0.0]]),
        np.array([4.0, 4.0]),
        np.array([0.0, 0.0]),
        np.array([[1.0, 0.0]]),
        np.array([10.0]),
        tau_km=1.0,
    )
    # Five applicants per school exceed both 4-place fill targets, so both
    # cutoffs bind at the postcode's 1km distance.
    assert cutoffs.tolist() == [1.0, 1.0]
def test_capacity_fill_radii_covers_fill_target_population():
    """The fill radius grows until enough children are covered, up to the cap.

    A school needing 6 children passes its target at 2km (5 children at 1km
    plus 5 at 2km). A school needing more children than exist within the cap
    keeps the cap itself.
    """
    school_xy = np.array([[0.0, 0.0], [0.0, 0.0]])
    fill_targets = np.array([6.0, 1000.0])
    postcode_xy = np.array([[1.0, 0.0], [2.0, 0.0], [3.0, 0.0]])
    children = np.array([5.0, 5.0, 5.0])

    radii = capacity_fill_radii(
        school_xy, fill_targets, postcode_xy, children, max_radius_km=25.0
    )

    assert radii.tolist() == [2.0, 25.0]
def test_count_covering_catchments_respects_radius_and_validity():
    """Counts include only schools whose radius reaches a valid postcode."""
    postcode_xy = np.array([[0.0, 0.0], [3.0, 0.0], [10.0, 0.0], [0.5, 0.0]])
    postcode_valid = np.array([True, True, True, False])
    schools = np.array([[0.0, 0.0], [2.0, 0.0]])
    catchment_radii = np.array([4.0, 1.5])

    counts = count_covering_catchments(
        postcode_xy, postcode_valid, schools, catchment_radii, 4
    )

    # Postcode 0 is inside school 0 only (school 1 is 2km away > 1.5km radius);
    # postcode 1 is inside both; postcode 2 is inside neither; postcode 3 is
    # invalid, so it scores 0 despite its proximity.
    assert counts.tolist() == [1, 2, 0, 0]
def test_count_covering_catchments_empty_schools():
    """With no schools at all, every postcode is covered by zero catchments."""
    postcode_xy = np.zeros((2, 2))
    postcode_valid = np.array([True, True])
    no_schools = np.empty((0, 2))
    no_radii = np.empty(0)

    counts = count_covering_catchments(
        postcode_xy, postcode_valid, no_schools, no_radii, 2
    )

    assert counts.tolist() == [0, 0]

View file

@ -1,139 +0,0 @@
import polars as pl
from pipeline.transform.school_proximity import classify_good_plus_schools
def _school(phase, oeif, ungraded, postcode="AA1 1AA"):
return {
"Postcode": postcode,
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": ungraded,
}
def _classify(rows):
result = classify_good_plus_schools(pl.DataFrame(rows))
return {(r["postcode"], r["category"]) for r in result.to_dicts()}
def test_legacy_oeif_grades_1_and_2_are_kept():
rows = [
_school("Primary", "1", None, "AA1 1AA"),
_school("Primary", "2", None, "AA1 1AB"),
_school("Secondary", "1", None, "AA1 1AC"),
_school("Secondary", "2", None, "AA1 1AD"),
]
assert _classify(rows) == {
("AA1 1AA", "outstanding_primary"),
("AA1 1AB", "good_primary"),
("AA1 1AC", "outstanding_secondary"),
("AA1 1AD", "good_secondary"),
}
def test_grades_3_and_4_are_excluded():
rows = [_school("Primary", "3", None), _school("Primary", "4", None)]
assert _classify(rows) == set()
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
# Null and "Not judged" OEIF fall back to the ungraded outcome.
rows = [
_school("Primary", None, "School remains Good", "AA1 1AA"),
_school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
# "(Improving)" is still good+ ...
_school("Primary", None, "School remains Good (Improving) - S5 Next", "AA1 1AE"),
]
assert _classify(rows) == {
("AA1 1AA", "good_primary"),
("AA1 1AB", "outstanding_secondary"),
("AA1 1AE", "good_primary"),
}
def test_ungraded_concerns_are_not_good_plus():
# "(Concerns)" outcomes signal issues warranting earlier re-inspection and
# must NOT be counted as good+ schools.
rows = [
_school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
_school(
"Secondary",
None,
"School remains Outstanding (Concerns) - S5 Next",
"AA1 1AD",
),
]
assert _classify(rows) == set()
def test_ungraded_non_good_outcomes_are_excluded():
rows = [
_school("Primary", None, "Some aspects not as strong"),
_school("Primary", None, "Standards maintained"),
_school("Primary", None, None),
]
assert _classify(rows) == set()
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
# A real grade 3 must not be promoted by an ungraded "remains Good".
rows = [_school("Primary", "3", "School remains Good")]
assert _classify(rows) == set()
def test_non_primary_secondary_phases_excluded():
rows = [
_school("Nursery", "1", None),
_school("Not applicable", "2", None),
]
assert _classify(rows) == set()
def _aged_school(phase, oeif, low, high, postcode="AA1 1AA"):
return {
"Postcode": postcode,
"Ofsted phase": phase,
"Latest OEIF overall effectiveness": oeif,
"Ungraded inspection overall outcome": None,
"URN": 100000,
"Statutory lowest age": low,
"Statutory highest age": high,
}
def test_all_through_school_counts_toward_both_primary_and_secondary():
# An all-through school (age 3-18) is labelled "Secondary" by Ofsted phase but
# serves primary-age children too, so it must count in BOTH metrics.
rows = [_aged_school("Secondary", "2", 3, 18, "AA1 1AA")]
assert _classify(rows) == {
("AA1 1AA", "good_primary"),
("AA1 1AA", "good_secondary"),
}
def test_age_ranges_assign_single_phase_for_standard_schools():
rows = [
_aged_school("Primary", "1", 4, 11, "AA1 1AA"), # primary only
_aged_school("Secondary", "2", 11, 16, "AA1 1AB"), # secondary only
_aged_school("Secondary", "1", 9, 13, "AA1 1AC"), # middle -> both
]
assert _classify(rows) == {
("AA1 1AA", "outstanding_primary"),
("AA1 1AB", "good_secondary"),
("AA1 1AC", "outstanding_primary"),
("AA1 1AC", "outstanding_secondary"),
}
def test_closed_schools_excluded_when_open_register_given():
rows = [
_aged_school("Primary", "1", 4, 11, "AA1 1AA"),
_aged_school("Secondary", "2", 11, 16, "AA1 1AB"),
]
rows[0]["URN"] = 111
rows[1]["URN"] = 222
result = classify_good_plus_schools(pl.DataFrame(rows), open_urns={111})
pairs = {(r["postcode"], r["category"]) for r in result.to_dicts()}
# URN 222 is not in the open register, so it is dropped.
assert pairs == {("AA1 1AA", "outstanding_primary")}

View file

@ -544,6 +544,142 @@ def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
assert n2_grocery.height == 1
def test_transform_drops_miscategorised_tags(tmp_path):
    """Tags flagged by the 2026-06-10 audit are dropped entirely.

    They polluted Entertainment (cycle-hire docks, slipways, marinas), Gallery
    (public artwork), Pharmacy (herbalists, alternative medicine), Hospital &
    Clinic (untyped healthcare/yes), Tourist Attraction (fountains,
    courthouses) and Gym & Fitness (outdoor apparatus).
    """
    dropped = [
        "amenity/bicycle_rental",
        "amenity/boat_rental",
        "leisure/marina",
        "leisure/slipway",
        "tourism/artwork",
        "healthcare/yes",
        "healthcare/alternative",
        "shop/herbalist",
        "shop/health",
        "amenity/fountain",
        "amenity/courthouse",
        "leisure/fitness_station",
    ]
    count = len(dropped)
    ids = [f"n{i}" for i in range(count)]
    raw = pl.DataFrame(
        {
            "id": ids,
            "name": [f"POI {i}" for i in range(count)],
            "category": dropped,
            "lat": [51.50] * count,
            "lng": [-0.10] * count,
        }
    )
    inputs = _write_transform_inputs(tmp_path, raw)

    out = transform(**inputs).collect()

    survivors = out.filter(pl.col("id").is_in(ids))
    assert survivors.height == 0
def test_transform_splits_hospital_and_clinic(tmp_path):
    """Hospitals and clinics land in separate categories, never the old merged one."""
    raw = pl.DataFrame(
        {
            "id": ["n1", "n2", "n3"],
            "name": ["St Thomas'", "Vale Surgery Annexe", "Drop-in Centre"],
            "category": [
                "amenity/hospital",
                "amenity/clinic",
                "healthcare/clinic",
            ],
            "lat": [51.50, 51.51, 51.52],
            "lng": [-0.10, -0.11, -0.12],
        }
    )
    inputs = _write_transform_inputs(tmp_path, raw)

    out = transform(**inputs).collect()

    expected_by_id = {"n1": "Hospital", "n2": "Clinic", "n3": "Clinic"}
    for poi_id, expected_category in expected_by_id.items():
        categories = out.filter(pl.col("id") == poi_id)["category"].to_list()
        assert categories == [expected_category]
    assert "Hospital & Clinic" not in out["category"].to_list()
def test_transform_maps_chalet_to_hotel(tmp_path):
    """Holiday-let chalets are accommodation ("Hotel"), not Tourist Attractions."""
    chalet = {
        "id": ["n1"],
        "name": ["Seaview Chalet"],
        "category": ["tourism/chalet"],
        "lat": [51.50],
        "lng": [-0.10],
    }
    inputs = _write_transform_inputs(tmp_path, pl.DataFrame(chalet))

    out = transform(**inputs).collect()

    mapped = out.filter(pl.col("id") == "n1")["category"].to_list()
    assert mapped == ["Hotel"]
def test_transform_name_gates_track_horse_riding_fishing(tmp_path):
    """Unnamed track/horse_riding/fishing POIs are dropped; named ones survive.

    leisure/track, leisure/horse_riding and leisure/fishing are 83-84% unnamed
    (anonymous tracks/gallops/fishing spots); only named public facilities
    survive as a Sports Centre. The unnamed path is exercised for all three
    tags, with both a null and an empty-string name.
    """
    raw = pl.DataFrame(
        {
            "id": ["n1", "n2", "n3", "n4", "n5"],
            "name": [
                None,
                "",
                "Herne Hill Velodrome",
                "Royal Mews Riding School",
                None,
            ],
            "category": [
                "leisure/track",
                "leisure/fishing",
                "leisure/track",
                "leisure/horse_riding",
                "leisure/horse_riding",
            ],
            "lat": [51.50, 51.51, 51.52, 51.53, 51.54],
            "lng": [-0.10, -0.11, -0.12, -0.13, -0.14],
        }
    )
    inputs = _write_transform_inputs(tmp_path, raw)
    out = transform(**inputs).collect()
    # n1/n2/n5 are unnamed rows, one per name-gated tag: none may survive.
    assert out.filter(pl.col("id").is_in(["n1", "n2", "n5"])).height == 0
    named = out.filter(pl.col("id").is_in(["n3", "n4"]))
    assert named["category"].to_list() == ["Sports Centre", "Sports Centre"]
def test_transform_passes_through_tram_metro_naptan_category(tmp_path):
    """The NaPTAN "Tram & Metro stop" category flows through the transform.

    NaPTAN now emits "Tram & Metro stop" (non-LU TMU/MET networks); it must
    keep the Public Transport group and get its own emoji.
    """
    osm_pois = pl.DataFrame(
        {
            "id": ["n1"],
            "name": ["A Cafe"],
            "category": ["amenity/cafe"],
            "lat": [51.50],
            "lng": [-0.10],
        }
    )
    inputs = _write_transform_inputs(tmp_path, osm_pois)

    # Overwrite the NaPTAN input with one rail station and one tram stop.
    naptan_stops = pl.DataFrame(
        {
            "id": ["naptan-1", "naptan-2"],
            "name": ["Test Rail Station", "Weaste"],
            "category": ["Rail station", "Tram & Metro stop"],
            "lat": [51.51, 51.52],
            "lng": [-0.13, -0.14],
        }
    )
    naptan_stops.write_parquet(inputs["naptan_path"])

    out = transform(**inputs).collect()

    tram_rows = out.filter(pl.col("category") == "Tram & Metro stop")
    assert tram_rows.height == 1
    assert tram_rows["group"].to_list() == ["Public Transport"]
    assert tram_rows["emoji"].to_list() == ["🚊"]
def test_transform_output_unique_per_id_category(tmp_path):
# Soundness: the full transform() output has at most one row per
# (id, category) overall, across every source.

View file

@ -86,6 +86,28 @@ DROP_CATEGORIES = {
"amenity/water_point",
"amenity/watering_place",
"amenity/weighbridge",
# Boating/cycle-hire infrastructure formerly miscategorised as
# "Entertainment" (46% of the bucket): cycle-hire dock stations, boat
# ramps and moorings are not entertainment venues.
"amenity/bicycle_rental",
"amenity/boat_rental",
"leisure/marina",
"leisure/slipway",
# Public art (statues, murals, village signs) formerly 93% of "Gallery".
"tourism/artwork",
# Outdoor exercise apparatus (pull-up bars, trim trails) formerly inflating
# "Gym & Fitness".
"leisure/fitness_station",
# Untyped healthcare rows and non-pharmacy health shops formerly bucketed
# under "Hospital & Clinic" / "Pharmacy".
"healthcare/yes",
"healthcare/alternative",
"shop/herbalist",
"shop/health",
# Street fountains and courthouses formerly bucketed as
# "Tourist Attraction".
"amenity/fountain",
"amenity/courthouse",
# Niche amenities not useful for home buyers
"amenity/animal_boarding",
"amenity/animal_breeding",
@ -373,10 +395,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"leisure/tanning_salon",
"shop/amusements",
"tourism/theme_park",
"amenity/bicycle_rental",
"amenity/boat_rental",
"leisure/marina",
"leisure/slipway",
# bicycle_rental/boat_rental/marina/slipway used to live here and
# made up ~46% of the bucket (cycle-hire docks, boat ramps); they
# are infrastructure, not entertainment venues — see DROP_CATEGORIES.
"leisure/hackerspace",
"leisure/yes",
],
@ -699,7 +720,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🏋️",
[
"leisure/fitness_centre",
"leisure/fitness_station",
# leisure/fitness_station (outdoor pull-up bars / trim-trail
# apparatus, ~2.5k) is not a gym — see DROP_CATEGORIES.
"amenity/dojo",
"amenity/dancing_school",
],
@ -825,28 +847,37 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"amenity/pharmacy",
"healthcare/pharmacy",
"shop/chemist",
"shop/herbalist",
"shop/health",
"healthcare/alternative",
# healthcare/alternative, shop/herbalist and shop/health (homeopaths,
# herbalists, generic "health" shops) are not dispensing pharmacies
# — see DROP_CATEGORIES.
],
),
# "Hospital & Clinic" used to be one bucket; an actual hospital and a small
# clinic are very different amenities for a homebuyer, so they are split.
(
"Health",
"Hospital",
"🏥",
[
"amenity/hospital",
"healthcare/hospital",
],
),
(
"Health",
"Hospital & Clinic",
"🏥",
"Clinic",
"🩺",
[
"amenity/hospital",
"amenity/clinic",
"amenity/health_centre",
"healthcare/blood_donation",
"healthcare/hospital",
"healthcare/centre",
"healthcare/clinic",
"office/healthcare",
"healthcare/laboratory",
"healthcare/rehabilitation",
"healthcare/vaccination_centre",
"healthcare/yes",
# healthcare/yes (untyped junk rows) is dropped — see DROP_CATEGORIES.
],
),
(
@ -917,7 +948,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"🖼️",
[
"tourism/gallery",
"tourism/artwork",
# tourism/artwork (statues, murals, village signs) was 93% of this
# bucket and is not a visitable gallery — see DROP_CATEGORIES.
],
),
(
@ -961,9 +993,8 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
[
"tourism/attraction",
"tourism/aquarium",
"amenity/fountain",
"amenity/courthouse",
"tourism/chalet",
# amenity/fountain (street furniture) and amenity/courthouse are
# dropped; tourism/chalet (holiday lets) moved to "Hotel".
],
),
# Note: schools come from the GIAS register (see transform_gias_schools).
@ -982,6 +1013,9 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"leisure/resort",
"tourism/holiday_park",
"tourism/self_catering",
# Holiday-let chalets are accommodation, not tourist attractions
# (where they previously sat).
"tourism/chalet",
],
),
(
@ -1162,6 +1196,11 @@ REQUIRE_NAME_CATEGORIES = {
"leisure/practice_pitch",
"leisure/swimming_pool",
"leisure/paddling_pool",
# 83-84% unnamed: anonymous running tracks, private gallops/paddocks and
# fishing spots; only named public facilities count as a Sports Centre.
"leisure/track",
"leisure/horse_riding",
"leisure/fishing",
}
@ -1181,6 +1220,7 @@ NAPTAN_EMOJIS: dict[str, str] = {
"Bus station": "🚌",
"Taxi rank": "🚕",
"Tube station": "🚇",
"Tram & Metro stop": "🚊",
}
@ -1438,9 +1478,9 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
(null/"Not judged", e.g. schools last seen under the post-2024 ungraded
report-card framework) we fall back to "Ungraded inspection overall outcome"
so genuinely good/outstanding schools aren't dropped — mirroring
school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
school_catchments.classify_good_plus_schools. Remaining nulls drop out."""
grade_col = pl.col("Latest OEIF overall effectiveness")
# See school_proximity: the ungraded outcome carries "School remains Good"/
# See school_catchments: the ungraded outcome carries "School remains Good"/
# "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
# suffixes) when the graded column is null/"Not judged".
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)

View file

@ -55,6 +55,7 @@ const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
&[
"Rail station",
"Tube station",
"Tram & Metro stop",
"Bus station",
"Bus stop",
"Airport",
@ -79,7 +80,7 @@ const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
),
(
"Health",
&["GP Surgery", "Pharmacy", "Dentist", "Hospital & Clinic"],
&["GP Surgery", "Pharmacy", "Dentist", "Hospital", "Clinic"],
),
(
"Leisure",

View file

@ -180,20 +180,6 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
raw: false,
absolute: true,
}),
Feature::Enum(EnumFeatureConfig {
name: "Within conservation area",
order: Some(&["Yes", "No"]),
description: "Whether the postcode point falls inside a designated conservation area",
detail: "Planning Data conservation area boundaries, matched to the postcode representative point. The national dataset is a work in progress and may include duplicates or incomplete local coverage, so boundary-sensitive decisions should be checked with the local planning authority.",
source: "conservation-areas",
}),
Feature::Enum(EnumFeatureConfig {
name: "Listed building",
order: Some(&["Yes", "No"]),
description: "Whether this property appears to match a Historic England listed building entry",
detail: "Historic England National Heritage List for England listed-building points, matched conservatively to property addresses using the listed-entry name and nearby postcode candidates. Treat this as a screening signal, not a legal determination: verify any specific property on the NHLE and with the local planning authority.",
source: "listed-buildings",
}),
Feature::Numeric(FeatureConfig {
name: "Noise (dB)",
bounds: Bounds::Fixed {
@ -209,6 +195,20 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
raw: false,
absolute: false,
}),
Feature::Enum(EnumFeatureConfig {
name: "Within conservation area",
order: Some(&["Yes", "No"]),
description: "Whether the postcode point falls inside a designated conservation area",
detail: "Planning Data conservation area boundaries, matched to the postcode representative point. The national dataset is a work in progress and may include duplicates or incomplete local coverage, so boundary-sensitive decisions should be checked with the local planning authority.",
source: "conservation-areas",
}),
Feature::Enum(EnumFeatureConfig {
name: "Listed building",
order: Some(&["Yes", "No"]),
description: "Whether this property appears to match a Historic England listed building entry",
detail: "Historic England National Heritage List for England listed-building points, matched conservatively to property addresses using the listed-entry name and nearby postcode candidates. Treat this as a screening signal, not a legal determination: verify any specific property on the NHLE and with the local planning authority.",
source: "listed-buildings",
}),
],
},
FeatureGroup {
@ -307,89 +307,14 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
name: "Schools",
features: &[
Feature::Numeric(FeatureConfig {
name: "Good+ primary schools within 2km",
bounds: Bounds::Fixed {
min: 0.0,
max: 10.0,
},
step: 1.0,
description: "Primary schools rated Good or Outstanding by Ofsted within 2km",
detail: "State-funded primary schools within 2km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Good+ secondary schools within 2km",
bounds: Bounds::Fixed {
min: 0.0,
max: 5.0,
},
step: 1.0,
description: "Secondary schools rated Good or Outstanding by Ofsted within 2km",
detail: "State-funded secondary schools within 2km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Outstanding primary schools within 2km",
bounds: Bounds::Fixed {
min: 0.0,
max: 10.0,
},
step: 1.0,
description: "Primary schools rated Outstanding by Ofsted within 2km",
detail: "State-funded primary schools within 2km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Outstanding secondary schools within 2km",
bounds: Bounds::Fixed {
min: 0.0,
max: 5.0,
},
step: 1.0,
description: "Secondary schools rated Outstanding by Ofsted within 2km",
detail: "State-funded secondary schools within 2km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Good+ primary schools within 5km",
bounds: Bounds::Fixed {
min: 0.0,
max: 30.0,
},
step: 1.0,
description: "Primary schools rated Good or Outstanding by Ofsted within 5km",
detail: "State-funded primary schools within 5km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Good+ secondary schools within 5km",
name: "Good+ primary school catchments",
bounds: Bounds::Fixed {
min: 0.0,
max: 15.0,
},
step: 1.0,
description: "Secondary schools rated Good or Outstanding by Ofsted within 5km",
detail: "State-funded secondary schools within 5km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
description: "Primary schools rated Good or Outstanding whose modelled catchment area covers this postcode",
detail: "How many state-funded primary schools with a current Ofsted rating of Good or Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
@ -397,14 +322,14 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Outstanding primary schools within 5km",
name: "Good+ secondary school catchments",
bounds: Bounds::Fixed {
min: 0.0,
max: 30.0,
max: 11.0,
},
step: 1.0,
description: "Primary schools rated Outstanding by Ofsted within 5km",
detail: "State-funded primary schools within 5km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
description: "Secondary schools rated Good or Outstanding whose modelled catchment area covers this postcode",
detail: "How many state-funded secondary schools with a current Ofsted rating of Good or Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
@ -412,14 +337,29 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Outstanding secondary schools within 5km",
name: "Outstanding primary school catchments",
bounds: Bounds::Fixed {
min: 0.0,
max: 15.0,
max: 8.0,
},
step: 1.0,
description: "Secondary schools rated Outstanding by Ofsted within 5km",
detail: "State-funded secondary schools within 5km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
description: "Primary schools rated Outstanding whose modelled catchment area covers this postcode",
detail: "How many state-funded primary schools with a current Ofsted rating of Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Outstanding secondary school catchments",
bounds: Bounds::Fixed {
min: 0.0,
max: 4.0,
},
step: 1.0,
description: "Secondary schools rated Outstanding whose modelled catchment area covers this postcode",
detail: "How many state-funded secondary schools with a current Ofsted rating of Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",

View file

@ -62,6 +62,42 @@ pub struct AiFiltersResponse {
notes: String,
/// Number of properties matching the proposed property and travel time filters.
match_count: usize,
/// Bounding box of the matching properties so the client can move the
/// camera to where matches actually are. Absent when nothing matches.
#[serde(skip_serializing_if = "Option::is_none")]
match_bounds: Option<MatchBounds>,
}
/// Axis-aligned bounding box around the properties that matched a set of
/// filters, serialized for the client as `AiFiltersResponse::match_bounds`.
///
/// `percentile_trimmed_bounds` fills `south`/`north` from the matched
/// latitudes and `west`/`east` from the matched longitudes.
// NOTE(review): values are presumably decimal degrees — confirm against the
// source of `state.data.lat` / `state.data.lon`.
#[derive(Serialize)]
pub struct MatchBounds {
/// Lower latitude edge of the box.
south: f32,
/// Lower longitude edge of the box.
west: f32,
/// Upper latitude edge of the box.
north: f32,
/// Upper longitude edge of the box.
east: f32,
}
/// Bounding box over matched coordinates, trimmed to the 5th–95th percentile
/// per axis (when there are enough points) so a handful of remote outliers
/// doesn't zoom the camera out to all of England.
///
/// `lats` and `lons` are parallel vectors: entry `i` of each describes the
/// same matched row. Each axis is sorted and trimmed independently, so the
/// corners of the returned box need not coincide with any single input point.
///
/// Rows whose latitude or longitude is not finite (NaN or ±infinity) are
/// discarded before the percentiles are taken. `f32::total_cmp` would
/// otherwise sort such values to one end of the axis, where they could be
/// picked up as an edge of the box.
///
/// Returns `None` when the two vectors differ in length or when no finite
/// coordinate pair remains.
fn percentile_trimmed_bounds(lats: Vec<f32>, lons: Vec<f32>) -> Option<MatchBounds> {
    // Mismatched lengths mean the inputs are not row-aligned; `zip` below
    // would silently truncate, so reject them up front.
    if lats.len() != lons.len() {
        return None;
    }

    // Keep only rows where both coordinates are usable numbers.
    let (mut lats, mut lons): (Vec<f32>, Vec<f32>) = lats
        .into_iter()
        .zip(lons)
        .filter(|(lat, lon)| lat.is_finite() && lon.is_finite())
        .unzip();

    if lats.is_empty() {
        return None;
    }

    lats.sort_unstable_by(f32::total_cmp);
    lons.sort_unstable_by(f32::total_cmp);

    let last = lats.len() - 1;
    // With 20 or more points, drop 1/20th of them (5%) from each end of each
    // axis. With fewer points the trim would be zero anyway, so use the full
    // extent.
    let (lo, hi) = if lats.len() >= 20 {
        let trim = lats.len() / 20;
        (trim, last - trim)
    } else {
        (0, last)
    };

    Some(MatchBounds {
        south: lats[lo],
        north: lats[hi],
        west: lons[lo],
        east: lons[hi],
    })
}
/// Strip markdown code fences (```json ... ``` or ``` ... ```) from LLM output.
@ -90,17 +126,12 @@ fn school_feature_name_from_key(name: &str) -> Option<&'static str> {
let mut parts = rest.split(':');
let phase = parts.next()?;
let rating = parts.next()?;
let distance = parts.next()?;
match (phase, rating, distance) {
("primary", "good", "2") => Some("Good+ primary schools within 2km"),
("secondary", "good", "2") => Some("Good+ secondary schools within 2km"),
("primary", "outstanding", "2") => Some("Outstanding primary schools within 2km"),
("secondary", "outstanding", "2") => Some("Outstanding secondary schools within 2km"),
("primary", "good", "5") => Some("Good+ primary schools within 5km"),
("secondary", "good", "5") => Some("Good+ secondary schools within 5km"),
("primary", "outstanding", "5") => Some("Outstanding primary schools within 5km"),
("secondary", "outstanding", "5") => Some("Outstanding secondary schools within 5km"),
match (phase, rating) {
("primary", "good") => Some("Good+ primary school catchments"),
("secondary", "good") => Some("Good+ secondary school catchments"),
("primary", "outstanding") => Some("Outstanding primary school catchments"),
("secondary", "outstanding") => Some("Outstanding secondary school catchments"),
_ => None,
}
}
@ -508,8 +539,8 @@ pub fn build_system_prompt(
{\"name\": \"Serious crime (avg/yr)\", \"bound\": \"max\", \"value\": 5}, \
{\"name\": \"Minor crime (avg/yr)\", \"bound\": \"max\", \"value\": 20}, \
{\"name\": \"Noise (dB)\", \"bound\": \"max\", \"value\": 55}, \
{\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}, \
{\"name\": \"Good+ secondary schools within 2km\", \"bound\": \"min\", \"value\": 1}, \
{\"name\": \"Good+ primary school catchments\", \"bound\": \"min\", \"value\": 2}, \
{\"name\": \"Good+ secondary school catchments\", \"bound\": \"min\", \"value\": 1}, \
{\"name\": \"Number of amenities (Park) within 2km\", \"bound\": \"min\", \"value\": 3}], \
\"enum_filters\": [], \"travel_time_filters\": [], \"notes\": \"\"}"
.to_string(),
@ -519,8 +550,8 @@ pub fn build_system_prompt(
"\nUser: \"quiet area with outstanding schools\"\n\
Output: {\"numeric_filters\": [\
{\"name\": \"Noise (dB)\", \"bound\": \"max\", \"value\": 55}, \
{\"name\": \"Outstanding primary schools within 2km\", \"bound\": \"min\", \"value\": 1}, \
{\"name\": \"Outstanding secondary schools within 2km\", \"bound\": \"min\", \"value\": 1}], \
{\"name\": \"Outstanding primary school catchments\", \"bound\": \"min\", \"value\": 1}, \
{\"name\": \"Outstanding secondary school catchments\", \"bound\": \"min\", \"value\": 1}], \
\"enum_filters\": [], \"travel_time_filters\": [], \"notes\": \"\"}"
.to_string(),
);
@ -557,8 +588,8 @@ pub fn build_system_prompt(
Output: {\"numeric_filters\": [\
{\"name\": \"Total floor area (sqm)\", \"bound\": \"min\", \"value\": 100}, \
{\"name\": \"Number of bedrooms & living rooms\", \"bound\": \"min\", \"value\": 5}, \
{\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}, \
{\"name\": \"Good+ secondary schools within 2km\", \"bound\": \"min\", \"value\": 1}], \
{\"name\": \"Good+ primary school catchments\", \"bound\": \"min\", \"value\": 2}, \
{\"name\": \"Good+ secondary school catchments\", \"bound\": \"min\", \"value\": 1}], \
\"enum_filters\": [{\"name\": \"Property type\", \
\"values\": [\"Detached\", \"Semi-Detached\"]}], \
\"travel_time_filters\": [{\"mode\": \"car\", \"slug\": \"manchester\", \
@ -592,7 +623,7 @@ pub fn build_system_prompt(
"\nUser: \"3 bed house under 500k with good schools\"\n\
Output: {\
\"numeric_filters\": [{\"name\": \"Estimated current price\", \"bound\": \"max\", \"value\": 500000}, \
{\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}], \
{\"name\": \"Good+ primary school catchments\", \"bound\": \"min\", \"value\": 2}], \
\"enum_filters\": [{\"name\": \"Property type\", \
\"values\": [\"Detached\", \"Semi-Detached\", \"Terraced\"]}], \
\"travel_time_filters\": [], \
@ -759,7 +790,7 @@ fn count_matching_rows(
state: &AppState,
filters: &Value,
travel_time_filters: &[TravelTimeFilter],
) -> usize {
) -> (usize, Option<MatchBounds>) {
let filter_str = filters_to_filter_string(filters);
let quant = state.data.quant_ref();
@ -778,7 +809,7 @@ fn count_matching_rows(
Ok(f) => f,
Err(err) => {
warn!("Failed to parse filters for match count: {err}");
return 0;
return (0, None);
}
}
};
@ -800,6 +831,8 @@ fn count_matching_rows(
let has_poi_filters = !parsed_poi_filters.is_empty();
let mut count = 0usize;
let mut matched_lats: Vec<f32> = Vec::new();
let mut matched_lons: Vec<f32> = Vec::new();
for (row, pc_key) in pc_keys.iter().enumerate().take(num_rows) {
if !row_passes_filters(
row,
@ -836,9 +869,11 @@ fn count_matching_rows(
}
count += 1;
matched_lats.push(state.data.lat[row]);
matched_lons.push(state.data.lon[row]);
}
count
(count, percentile_trimmed_bounds(matched_lats, matched_lons))
}
/// Budget limits for the Gemini conversation loop. Separate counters prevent
@ -1132,7 +1167,7 @@ pub async fn post_ai_filters(
.to_string();
// Count matching properties and refine if too restrictive
let match_count = count_matching_rows(&state, &filters, &travel_time_filters);
let (match_count, match_bounds) = count_matching_rows(&state, &filters, &travel_time_filters);
info!(
match_count = match_count,
round = round,
@ -1173,6 +1208,7 @@ pub async fn post_ai_filters(
travel_time_filters,
notes,
match_count: 0,
match_bounds: None,
}));
}
@ -1236,6 +1272,7 @@ pub async fn post_ai_filters(
travel_time_filters,
notes,
match_count,
match_bounds,
}));
}
@ -1488,9 +1525,14 @@ mod tests {
#[test]
fn synthetic_filter_keys_are_normalized_to_backend_names() {
assert_eq!(
canonical_filter_name("Schools:primary:good:0"),
"Good+ primary school catchments"
);
// Legacy keys still carry a distance segment; it is ignored.
assert_eq!(
canonical_filter_name("Schools:primary:good:2:0"),
"Good+ primary schools within 2km"
"Good+ primary school catchments"
);
assert_eq!(
canonical_filter_name("Specific crimes:Burglary%20%28avg%2Fyr%29:1"),

View file

@ -68,8 +68,11 @@ pub async fn get_filter_counts(
let num_total_filters = num_regular + travel_filter_indices.len();
if num_total_filters == 0 {
// With no active filters the total is simply every property in bounds.
// count_in_bounds is O(grid cells), far cheaper than walking every row.
let total = state.grid.count_in_bounds(south, west, north, east) as u32;
return Ok(Json(FilterCountsResponse {
total: 0,
total,
impacts: FxHashMap::default(),
}));
}