This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@ -95,11 +95,14 @@ def transform_crime(
f"({valid_months[0]} to {valid_months[-1]})"
)
# Count monthly incidents, then annualise over every valid month in the dataset.
# `_weight` (≤1) comes from the LSOA 2011→2021 lookup: 2011 LSOAs that split
# into N 2021 LSOAs contribute 1/N of their count to each child, since we
# don't know which child a given incident actually belonged to.
yearly_counts = (
# Annualise each year separately (count_in_year * 12 / months_in_year), then
# take the simple mean of those per-year rates over the years each type is
# present. This makes the headline equal the average of the by-year chart bars
# (_write_crime_by_year) instead of a month-weighted pooled rate, mirroring
# crime_spatial._write_avg_yr. `_weight` (≤1) comes from the LSOA 2011→2021
# lookup: 2011 LSOAs that split into N 2021 LSOAs contribute 1/N of their count
# to each child, since we don't know which child an incident actually belonged to.
filtered = (
df.filter(
valid_month_expr
& pl.col("LSOA code").is_not_null()
@ -107,15 +110,31 @@ def transform_crime(
& pl.col("Crime type").is_not_null()
& (pl.col("Crime type") != "")
)
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
.group_by("LSOA code", "Month", "Crime type")
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "Crime type")
.agg(
(pl.col("count").sum() / pl.lit(valid_month_count) * 12)
.round(1)
.alias("yearly_avg")
.with_columns(
pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"),
pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES),
)
)
# Months observed *anywhere* in the dataset for each year (annualisation
# denominator), matching the by-year output's per-year scaling.
months_per_year = filtered.group_by("year").agg(
pl.col("Month").n_unique().alias("months_in_year")
)
yearly_counts = (
filtered.group_by("LSOA code", "year", "Crime type", "Month")
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "year", "Crime type")
.agg(pl.col("count").sum().alias("count"))
.join(months_per_year, on="year")
.with_columns(
(pl.col("count") * 12.0 / pl.col("months_in_year")).alias("per_year")
)
# Mean of the per-year annualised rates over the years the type is present
# (only years with rows are grouped here, so this is the correct x-span).
.group_by("LSOA code", "Crime type")
.agg(pl.col("per_year").mean().round(1).alias("yearly_avg"))
.collect(engine="streaming")
)
if yearly_counts.is_empty():

View file

@ -259,11 +259,14 @@ def _write_avg_yr(
"""
months = np.array([months_in_year[year] for year in years], dtype=np.float64)
per_year = counts.astype(np.float64) * 12.0 / months[None, None, :]
# Average over the years each type is actually observed anywhere -- the same
# per-type x-span the by-year chart plots (server-rs/.../crime_by_year.rs).
type_year_present = counts.sum(axis=0) > 0 # (n_types, n_years)
years_per_type = np.clip(type_year_present.sum(axis=1), 1, None).astype(np.float64)
avg = per_year.sum(axis=2) / years_per_type[None, :] # (n_postcodes, n_types)
# Average over the years *this postcode* actually has incidents of *this
# type* -- the same per-(postcode, type) x-span the by-year chart plots
# (server-rs/.../crime_by_year.rs), so the headline equals the mean of the
# by-year bars. Dividing by a global years-present count (years a type
# appeared anywhere in England) would deflate postcodes whose incidents
# cluster in only a few years of the ~13-year window.
years_present = np.clip((counts > 0).sum(axis=2), 1, None).astype(np.float64)
avg = per_year.sum(axis=2) / years_present # (n_postcodes, n_types)
avg = np.round(avg * norm[:, None], 1).astype(np.float32)
data: dict[str, np.ndarray] = {"postcode": postcodes}

View file

@ -365,6 +365,16 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
}
duration_map = {"F": "Freehold", "L": "Leasehold"}
# price >= MIN_PRICE and ppd_category == "A" (standard open-market sale) are
# VALUE-QUALITY filters: they gate the price aggregations only. Category B
# entries (repossessions, bulk/portfolio, power-of-sale transfers) and sub-MIN
# sales must not pollute latest_price / historical_prices (and the downstream
# price-per-sqm feature), but they MUST still count for first_transfer_date /
# old_new so a new-build's genuine earliest transfer year is preserved.
price_ok = pl.col("price") >= MIN_PRICE
category_ok = pl.col("ppd_category") == "A"
quality_ok = price_ok & category_ok
price_paid = (
pl.scan_parquet(price_paid_path)
.select(
@ -381,9 +391,9 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
"town_city",
pl.col("duration").replace(duration_map),
"old_new",
"ppd_category",
)
.filter(pl.col("pp_property_type") != "Other")
.filter(pl.col("price") >= MIN_PRICE)
.with_columns(
pl.concat_str(
[pl.col("saon"), pl.col("paon"), pl.col("street")],
@ -408,18 +418,26 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
pl.col("postcode").last(),
pl.col("_pp_match_address").last(),
pl.col("_pp_match_postcode").last(),
# Price aggregations are restricted to quality-passing sales.
pl.struct(
pl.col("date_of_transfer").dt.year().alias("year"),
pl.col("date_of_transfer").dt.month().cast(pl.UInt8).alias("month"),
"price",
).alias("historical_prices"),
)
.filter(quality_ok)
.alias("historical_prices"),
pl.col("pp_property_type").last(),
pl.col("duration").last(),
pl.col("price").last().alias("latest_price"),
pl.col("date_of_transfer").last(),
pl.col("price").filter(quality_ok).last().alias("latest_price"),
pl.col("date_of_transfer").filter(quality_ok).last(),
# first_transfer_date / old_new reflect the genuine earliest transfer
# over the full per-group transaction stream (not value-filtered).
pl.col("date_of_transfer").first().alias("first_transfer_date"),
pl.col("old_new").first(),
)
# Preserve the property universe: previously a property needed >=1 sale
# >=MIN_PRICE to form a group, so drop groups with no quality-passing sale.
.filter(pl.col("latest_price").is_not_null())
)
print("Price paid dataset")

View file

@ -839,25 +839,36 @@ def _join_area_side_tables(
# Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA.
base = base.join(crime, on="postcode", how="left")
serious_crime_cols = [
"Violence and sexual offences (avg/yr)",
"Robbery (avg/yr)",
"Burglary (avg/yr)",
"Possession of weapons (avg/yr)",
]
minor_crime_cols = [
"Anti-social behaviour (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other theft (avg/yr)",
"Vehicle crime (avg/yr)",
"Public order (avg/yr)",
"Drugs (avg/yr)",
"Other crime (avg/yr)",
]
# The LEFT join leaves every per-type column null for postcodes absent from
# the crime table; sum_horizontal alone would fabricate a "zero crime"
# rollup there, so keep the rollup null when ALL components are null.
base = base.with_columns(
pl.sum_horizontal(
"Violence and sexual offences (avg/yr)",
"Robbery (avg/yr)",
"Burglary (avg/yr)",
"Possession of weapons (avg/yr)",
).alias("serious_crime_avg_yr"),
pl.sum_horizontal(
"Anti-social behaviour (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other theft (avg/yr)",
"Vehicle crime (avg/yr)",
"Public order (avg/yr)",
"Drugs (avg/yr)",
"Other crime (avg/yr)",
).alias("minor_crime_avg_yr"),
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
.then(None)
.otherwise(pl.sum_horizontal(serious_crime_cols))
.alias("serious_crime_avg_yr"),
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
.then(None)
.otherwise(pl.sum_horizontal(minor_crime_cols))
.alias("minor_crime_avg_yr"),
)
base = base.join(median_age, on="lsoa21", how="left")
@ -1179,7 +1190,22 @@ def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataF
# pages); tolerate its absence so older parquets and test fixtures still
# load. Digits-only so it compares equal to the EPC register's UPRN.
if "UPRN" in raw.collect_schema().names():
uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
# Mirror `_normalize_uprn` exactly so the listing key compares equal to
# the candidate-side key for every dtype. For a Float UPRN we must
# stringify via its integer form (100023336956.0 -> "100023336956"),
# otherwise stripping non-digits from "100023336956.0" yields a bogus
# trailing-zero key ("1000233369560") that never collides; and a
# non-integral float (e.g. 1.5) must be rejected rather than mangled.
uprn_col = pl.col("UPRN")
if raw.collect_schema()["UPRN"].is_float():
integral = uprn_col.cast(pl.Int64, strict=False)
uprn_digits = (
pl.when(integral == uprn_col)
.then(integral.cast(pl.Utf8).str.replace_all(r"\D", ""))
.otherwise(None)
)
else:
uprn_digits = uprn_col.cast(pl.Utf8).str.replace_all(r"\D", "")
listing_uprn_expr = (
pl.when(uprn_digits.str.len_chars() > 0)
.then(uprn_digits)
@ -1615,9 +1641,23 @@ def _enrich_listings_with_direct_epc(
def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
def _coalesced(raw_column: str, direct_column: str) -> pl.Expr:
coalesce = pl.coalesce(pl.col(raw_column), pl.col(direct_column))
# The raw property-level value is fill_null("No") upstream, so a plain
# coalesce lets a non-null "No" override a directly-matched listing
# "Yes". "Former council house" should fire if EITHER side says so.
if raw_column == "was_council_house":
return (
pl.when((pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes"))
.then(pl.lit("Yes"))
.otherwise(coalesce)
.alias(raw_column)
)
return coalesce.alias(raw_column)
return wide.with_columns(
[
pl.coalesce(pl.col(raw_column), pl.col(direct_column)).alias(raw_column)
_coalesced(raw_column, direct_column)
for raw_column, direct_column in _DIRECT_EPC_RAW_COLUMN_MAP.items()
]
)

View file

@ -12,11 +12,19 @@ from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_
# POI category groups for proximity counting (2km radius).
# Names must match the friendly names produced by transform_poi.py / naptan.py.
# "groceries" is filled in dynamically by _groceries_categories() because the
# GEOLYTIX dataset stores the brand (e.g. "Tesco", "Aldi") in `category` rather
# than the literal "Supermarket"; counting only the OSM strings here severely
# understates the metric. See _groceries_categories below.
POI_GROUPS_2KM = {
"restaurants": ["Restaurant", "Fast Food"],
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
}
# POI group whose members are counted for the static "groceries" 2km metric.
# Covers both the OSM grocery categories (Supermarket, Convenience Store,
# Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
GROCERIES_GROUP = "Groceries"
# OS Open Greenspace function types used for park counts and distance calculation.
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
# of green spaces that are only mapped as polygons in OSM.
@ -41,6 +49,26 @@ def _poi_category_slug(category: str) -> str:
return slug or "poi"
def _groceries_categories(pois: pl.DataFrame) -> list[str]:
    """List every distinct `category` value that belongs to the Groceries group.

    `count_pois_per_postcode` matches POIs on `category`. The GEOLYTIX grocery
    dataset puts the brand name there (e.g. "Tesco", "Aldi") under group
    "Groceries" and never emits the literal "Supermarket", so the group's
    categories are collected from the data instead of being hard-coded. The
    result therefore covers both the OSM strings and the brand names.

    Args:
        pois: POI dataframe with at least `group` and `category` columns.

    Returns:
        The distinct categories of the Groceries group, sorted ascending.

    Raises:
        ValueError: if `pois` has no `group` column.
    """
    if "group" not in pois.columns:
        raise ValueError("POI dataframe must include a 'group' column")

    in_groceries = pl.col("group") == GROCERIES_GROUP
    distinct_categories = pois.filter(in_groceries).select("category").unique()
    ordered = distinct_categories.sort("category")
    return ordered.to_series().to_list()
def _build_poi_category_groups(
pois: pl.DataFrame,
) -> tuple[dict[str, list[str]], dict[str, str]]:
@ -122,9 +150,15 @@ def main():
pois = pl.read_parquet(args.pois)
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
# Count static amenity groups within 2km.
# Count static amenity groups within 2km. "groceries" is matched against
# every Groceries category (OSM strings + GEOLYTIX brand names) so that
# postcodes ringed by GEOLYTIX-only chains (Tesco, Aldi, ...) are counted.
groups_2km = {
**POI_GROUPS_2KM,
"groceries": _groceries_categories(pois),
}
counts_2km = count_pois_per_postcode(
postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
postcodes, pois, groups=groups_2km, radius_km=2
)
# Dynamic amenity filters: nearest distance plus counts within 2km and 5km for

View file

@ -77,9 +77,9 @@ The output of `process_oa` is `list[(postcode, polygon)]` — the per-OA fragmen
### Phase 4: Merging and writing
**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 5m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps only the largest polygon — postcodes are contiguous delivery routes, so detached fragments are artifacts.
**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 5m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps the largest part **plus any other part ≥ `_MIN_DETACHED_PART_AREA` (100 m²)** (`_keep_polygon_parts`); only sub-100 m² noise slivers are dropped. Keeping substantial detached parts matters because a postcode genuinely split across an OA seam (by a railway, river, or main road wider than the 5m buffer) would otherwise lose a chunk — measured at ~1.8% of merged area left as uncovered gaps (often 3000–5000 m² building blocks) before this change.
**GeoJSON output** (`output.py:write_district_geojson`): Groups postcodes by district (the outward code, e.g. `SW1A` from `SW1A 1AA`). For each district, converts every postcode polygon from BNG to WGS84 using pyproj, simplifies with 1m tolerance (Douglas-Peucker), rounds coordinates to 6 decimal places (~0.1m precision), and writes a single `{district}.geojson` FeatureCollection. Each Feature has `postcodes` (formatted like `"SW1A 1AA"`) and `mapit_code` (no space: `"SW1A1AA"`) in its properties.
**GeoJSON output** (`output.py:write_district_geojson`): Two passes. Pass 1 converts every postcode from BNG to WGS84 (pyproj), simplifies with 1m tolerance (Douglas-Peucker), and snaps to 6 decimal places (~0.1m precision); multi-part postcodes become `MultiPolygon` (`to_wgs84_geojson_multi`, each part handled independently), single-part stay `Polygon`. The whole set is then made a **partition** (`_resolve_overlaps`): each postcode is trimmed by the union of its higher-priority overlapping neighbours, where **priority = ascending area** (smaller postcodes win contested ground). That single rule handles both seam overlap *and* containment — an enclosed postcode is always smaller than its container, so it keeps its area while the container gets a hole (the query uses both the `overlaps` and `contains` predicates, since `overlaps` alone excludes containment). This runs last, so nothing re-introduces overlap; a postcode that would be emptied keeps its original geometry, so no active postcode is dropped. Pass 2 groups postcodes by district (the outward code, e.g. `SW1A` from `SW1A 1AA`), rounds coordinates to 6dp, and writes a `{district}.geojson` FeatureCollection. Each Feature has `postcodes` (formatted like `"SW1A 1AA"`) and `mapit_code` (no space: `"SW1A1AA"`) in its properties.
## Memory architecture
@ -103,10 +103,10 @@ Key design choices:
## Key invariants
1. **Every square meter of every OA is assigned to exactly one postcode** — the combination of INSPIRE claiming + Voronoi fills the entire OA, and overlap resolution ensures no double-counting
1. **No two postcodes cover the same ground in the output** — within an OA the INSPIRE claiming + Voronoi tile it with no overlap, and a final `_resolve_overlaps` partition pass removes the thin overlap strips that the merge buffer + per-postcode simplification introduce across OA seams (measured residual overlap ~0.01% of area)
2. **Every postcode that exists in the UPRN data gets a polygon** — unless all its UPRNs share coordinates with another postcode's UPRNs (handled by jitter) or it has zero UPRNs
3. **Postcode polygons never extend outside their OA(s)** — all geometry is clipped to OA boundaries
4. **Output is always single Polygon, never MultiPolygon** — the largest-polygon extraction in both `merge_fragments` and `to_wgs84_geojson` ensures this
4. **A postcode split across an OA seam keeps all its substantial parts** — `merge_fragments` keeps every part ≥ 100 m² and the output is emitted as a `MultiPolygon` (the Rust server `postcodes.rs` and `loader.py` both parse MultiPolygon); only sub-100 m² noise slivers are dropped
## Module structure

View file

@ -1,12 +1,21 @@
import argparse
import multiprocessing as mp
import os
from pathlib import Path
import numpy as np
import shapely
from shapely.geometry import MultiPolygon, Polygon
from tqdm import tqdm
from .fragments_cache import (
fragments_cache_is_fresh,
load_fragments,
save_fragments,
)
from .inspire import (
build_inspire_index,
cache_inspire,
get_inspire_candidates,
inspire_cache_exists,
load_inspire,
)
@ -14,7 +23,206 @@ from .memory import release_memory
from .oa_boundaries import load_oa_boundaries
from .output import merge_fragments, write_district_geojson
from .process_oa import process_oa
from .uprn import get_oa_uprns, load_uprns
from .uprn import extract_uprn_arrays, get_oa_uprns_arrays, load_uprns
Fragment = tuple[str, Polygon | MultiPolygon]
def _oa_fragments(
    oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
) -> tuple[list[Fragment], bool]:
    """Turn a single OA into its ``(postcode, geometry)`` fragments.

    Both the sequential and the parallel driver call this, so the two paths
    emit the same fragments.

    Args:
        oa_code: Code of the OA to process.
        oa_geoms: Mapping from OA code to its boundary geometry.
        east, north, postcodes_arr, offsets: UPRN arrays, passed straight to
            ``get_oa_uprns_arrays`` to select this OA's points and postcodes.
        index: Object exposing ``candidates(bounds)`` for INSPIRE parcels.

    Returns:
        ``(fragments, is_single)``. ``is_single`` is True when the OA holds
        exactly one distinct postcode, in which case the whole OA boundary is
        returned as that postcode's only fragment (fast path).

    Raises:
        RuntimeError: wraps any failure and names the offending OA, so a bad
            OA is attributable rather than an anonymous worker abort.
    """
    try:
        boundary = oa_geoms[oa_code]
        points, postcodes = get_oa_uprns_arrays(
            east, north, postcodes_arr, offsets, oa_code
        )
        distinct_postcodes = set(postcodes)
        if len(distinct_postcodes) != 1:
            # Several postcodes share the OA: split it using nearby parcels.
            parcels = index.candidates(boundary.bounds)
            return process_oa(boundary, points, postcodes, parcels), False
        return [(postcodes[0], boundary)], True
    except Exception as exc:
        raise RuntimeError(f"Failed processing OA {oa_code}: {exc!r}") from exc
# Worker-shared state. Populated in the parent before the pool forks; children
# inherit it copy-on-write (the numpy/Arrow buffers + coords mmap stay shared,
# never duplicated per worker). Read-only in workers.
_WORKER_STATE: dict = {}
def _process_oa_chunk(oa_codes: list[str]):
    """Worker entry point: process a chunk of OA codes into WKB fragments.

    Inputs are read from the module-level ``_WORKER_STATE``. Geometries are
    sent back as WKB (compact and lossless) instead of pickled Shapely
    objects, which keeps the IPC payload small.

    Args:
        oa_codes: The OA codes that make up this chunk.

    Returns:
        ``(postcodes, wkb, single_count, n_oas)``: the fragment postcodes, a
        parallel array of WKB geometries, the number of OAs that took the
        single-postcode fast path, and the number of OAs in the chunk.
    """
    shared = _WORKER_STATE
    chunk_postcodes = []
    chunk_geoms = []
    n_single = 0
    for code in oa_codes:
        oa_frags, is_single = _oa_fragments(
            code,
            shared["oa_geoms"],
            shared["east"],
            shared["north"],
            shared["postcodes"],
            shared["offsets"],
            shared["index"],
        )
        for postcode, geom in oa_frags:
            chunk_postcodes.append(postcode)
            chunk_geoms.append(geom)
        n_single += is_single

    if not chunk_geoms:
        return [], np.empty(0, dtype=object), n_single, len(oa_codes)

    encoded = shapely.to_wkb(np.array(chunk_geoms, dtype=object))
    return chunk_postcodes, encoded, n_single, len(oa_codes)
def _resolve_workers(requested: int) -> int:
"""Worker count: the explicit value if >0, otherwise all available CPUs."""
if requested and requested > 0:
return requested
try:
return max(1, len(os.sched_getaffinity(0)))
except AttributeError:
return max(1, os.cpu_count() or 1)
def _process_oas(
    oa_codes, oa_geoms, east, north, postcodes_arr, offsets, index, workers
) -> tuple[list[Fragment], int]:
    """Run Phase 3 over every OA, optionally fanned out across processes.

    OAs are processed independently. With more than one worker and the
    ``fork`` start method available, the inputs are published through the
    module-level ``_WORKER_STATE`` before the pool is created so the workers
    can read them, and chunks are collected in completion order. Otherwise the
    OAs are processed in-process, one after another.

    Args:
        oa_codes: OA codes to process.
        oa_geoms, east, north, postcodes_arr, offsets, index: Inputs handed
            unchanged to ``_oa_fragments``.
        workers: Number of worker processes; ``<= 1`` means sequential.

    Returns:
        ``(fragments, single_count)``: every ``(postcode, geometry)`` fragment
        and the number of OAs that took the single-postcode fast path.
    """
    collected: list[Fragment] = []
    n_single = 0

    run_parallel = workers > 1 and "fork" in mp.get_all_start_methods()
    if not run_parallel:
        progress = tqdm(
            oa_codes, desc="Processing OAs", unit="OA", smoothing=0.01, miniters=100
        )
        for oa_code in progress:
            oa_frags, is_single = _oa_fragments(
                oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
            )
            collected.extend(oa_frags)
            n_single += is_single
        return collected, n_single

    _WORKER_STATE.update(
        oa_geoms=oa_geoms,
        east=east,
        north=north,
        postcodes=postcodes_arr,
        offsets=offsets,
        index=index,
    )

    # Many small contiguous chunks give dynamic load balancing across workers
    # (OA cost varies a lot) while each chunk stays a contiguous run of codes.
    chunk_size = max(1, len(oa_codes) // (workers * 16))
    chunk_starts = range(0, len(oa_codes), chunk_size)
    chunks = [oa_codes[start : start + chunk_size] for start in chunk_starts]
    print(f" Parallel: {workers} workers, {len(chunks)} chunks of ~{chunk_size} OAs")

    fork_ctx = mp.get_context("fork")
    try:
        with fork_ctx.Pool(processes=workers) as pool, tqdm(
            total=len(oa_codes), desc="Processing OAs", unit="OA", smoothing=0.01
        ) as bar:
            finished = pool.imap_unordered(_process_oa_chunk, chunks)
            for pcs, wkb, single, n_oas in finished:
                if len(wkb):
                    collected.extend(zip(pcs, shapely.from_wkb(wkb)))
                n_single += single
                bar.update(n_oas)
    finally:
        # Drop the references so later phases don't keep the big inputs alive.
        _WORKER_STATE.clear()
    return collected, n_single
def build_fragments(args: argparse.Namespace) -> list[Fragment]:
    """Run Phases 1-3 and return every ``(postcode, geometry)`` fragment.

    Phase 1 loads OA boundaries and UPRNs, Phase 2 loads (building the cache
    first if it is missing) and indexes the INSPIRE parcels, and Phase 3
    processes every OA that has both a boundary and UPRNs. The large
    intermediates are locals of this function, so nothing here keeps them
    referenced once it returns.

    Args:
        args: Parsed CLI arguments; reads ``oa_boundaries``, ``uprn``,
            ``arcgis``, ``inspire``, ``output``, ``limit`` and ``workers``.

    Returns:
        The full fragment list produced by Phase 3.
    """
    rule = "=" * 60

    # Phase 1: Load all data
    print(rule)
    print("Phase 1: Loading data")
    print(rule)

    oa_geoms = load_oa_boundaries(args.oa_boundaries)
    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
    # Plain arrays are extracted up front so parallel workers never have to
    # touch the polars dataframe itself.
    uprn_east, uprn_north, uprn_postcodes = extract_uprn_arrays(uprn_df)

    # Phase 2: Parse/load INSPIRE
    print()
    print(rule)
    print("Phase 2: INSPIRE data")
    print(rule)

    inspire_cache_dir = args.output / "inspire_cache"
    if not inspire_cache_exists(inspire_cache_dir):
        cache_inspire(args.inspire, inspire_cache_dir)
    inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
    inspire_index = build_inspire_index(inspire_bboxes, inspire_offsets, inspire_coords)

    # Phase 3: Process OAs
    print()
    print(rule)
    print("Phase 3: Processing OAs")
    print(rule)

    # Work list: OAs that have both a boundary and at least one UPRN. The skip
    # counts are taken before --limit truncates the list.
    shared_codes = set(oa_geoms.keys()).intersection(uprn_offsets.keys())
    oa_codes_with_data = sorted(shared_codes)
    skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
    skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)

    limited = args.limit > 0
    if limited:
        oa_codes_with_data = oa_codes_with_data[: args.limit]

    print(f" OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
    print(f" Skipped (no UPRNs): {skipped_no_uprn}")
    print(f" Skipped (no boundary): {skipped_no_boundary}")

    # --limit is a debug mode, so it forces deterministic single-process runs.
    if limited:
        workers = 1
    else:
        workers = _resolve_workers(args.workers)

    all_fragments, single_count = _process_oas(
        oa_codes_with_data,
        oa_geoms,
        uprn_east,
        uprn_north,
        uprn_postcodes,
        uprn_offsets,
        inspire_index,
        workers,
    )
    multi_count = len(oa_codes_with_data) - single_count

    print(f"\n Single-postcode OAs (fast path): {single_count}")
    print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
    print(f" Total fragments: {len(all_fragments)}")
    return all_fragments
def main() -> None:
@ -38,6 +246,12 @@ def main() -> None:
parser.add_argument(
"--limit", type=int, default=0, help="Process only first N OAs (0=all)"
)
parser.add_argument(
"--workers",
type=int,
default=0,
help="Parallel worker processes for OA processing (0=all CPUs, 1=sequential)",
)
parser.add_argument(
"--greenspace",
type=Path,
@ -46,79 +260,30 @@ def main() -> None:
)
args = parser.parse_args()
# Phase 1: Load all data
print("=" * 60)
print("Phase 1: Loading data")
print("=" * 60)
fragments_cache = args.output / "fragments_cache.parquet"
# Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
# so a greenspace change must not invalidate the fragment cache.
fragment_inputs = [args.uprn, args.arcgis, args.oa_boundaries, args.inspire]
# --limit yields a partial fragment set; never read or write the shared cache.
use_cache = args.limit == 0
oa_geoms = load_oa_boundaries(args.oa_boundaries)
uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
# Phase 2: Parse/load INSPIRE
print()
print("=" * 60)
print("Phase 2: INSPIRE data")
print("=" * 60)
inspire_cache_dir = args.output / "inspire_cache"
if not inspire_cache_exists(inspire_cache_dir):
cache_inspire(args.inspire, inspire_cache_dir)
inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
# Phase 3: Process OAs
print()
print("=" * 60)
print("Phase 3: Processing OAs")
print("=" * 60)
# Build work list — precompute which OAs are single vs multi-postcode
oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)
if args.limit > 0:
oa_codes_with_data = oa_codes_with_data[: args.limit]
print(f" OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
print(f" Skipped (no UPRNs): {skipped_no_uprn}")
print(f" Skipped (no boundary): {skipped_no_boundary}")
all_fragments: list[tuple[str, Polygon | MultiPolygon]] = []
single_count = 0
multi_count = 0
for oa_code in tqdm(
oa_codes_with_data,
desc="Processing OAs",
unit="OA",
smoothing=0.01,
miniters=100,
):
oa_geom = oa_geoms[oa_code]
points, postcodes = get_oa_uprns(uprn_df, uprn_offsets, oa_code)
if len(set(postcodes)) == 1:
# Fast path: entire OA = one postcode
all_fragments.append((postcodes[0], oa_geom))
single_count += 1
continue
# Get INSPIRE candidates via bbox pre-filter
candidates = get_inspire_candidates(
oa_geom.bounds, inspire_bboxes, inspire_offsets, inspire_coords
if use_cache and fragments_cache_is_fresh(fragments_cache, fragment_inputs):
print("=" * 60)
print("Phase 3 cache hit — loading fragments (skipping Phases 1-3)")
print("=" * 60)
all_fragments = load_fragments(fragments_cache)
print(
f" Loaded {len(all_fragments):,} cached fragments from {fragments_cache}"
)
else:
all_fragments = build_fragments(args)
if use_cache:
# Persist the expensive Phase-3 output before the cheap-but-fragile
# merge/write so any failure there resumes in seconds, not ~10 hours.
save_fragments(fragments_cache, all_fragments)
print(f" Cached {len(all_fragments):,} fragments to {fragments_cache}")
fragments = process_oa(oa_geom, points, postcodes, candidates)
all_fragments.extend(fragments)
multi_count += 1
print(f"\n Single-postcode OAs (fast path): {single_count}")
print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
print(f" Total fragments: {len(all_fragments)}")
# Free data no longer needed
del oa_geoms, uprn_df, uprn_offsets
del inspire_bboxes, inspire_offsets, inspire_coords
# Free Phase-1-3 intermediates (build_fragments' locals) back to the OS.
release_memory()
# Phase 4: Merge and write
@ -145,6 +310,12 @@ def main() -> None:
file_count = write_district_geojson(merged, args.output)
print(f"\n Wrote {file_count} district GeoJSON files to {args.output / 'units'}")
# The cache exists only to survive a crash between Phase 3 and a clean write.
# Now that the output is complete, drop it so a later input change can never
# be served from a stale cache.
if use_cache:
fragments_cache.unlink(missing_ok=True)
print("Done!")

View file

@ -112,44 +112,130 @@ def load_inspire(
return bboxes, offsets, coords_mmap
def get_inspire_candidates(
oa_bounds: tuple[float, float, float, float],
# Grid cell size (m) for the parcel spatial index. The median parcel is ~25 m
# and the 99th percentile ~540 m, so almost every parcel fits inside a single
# 1 km cell; the ~0.4% larger than a cell go to an overflow list tested on every
# query.
_GRID_CELL_SIZE = 1000.0
class InspireIndex:
"""Uniform-grid spatial index over INSPIRE parcel bounding boxes.
The per-OA candidate lookup used to linear-scan all ~24M bboxes (O(N) per
OA, ~4 h total over the country). This indexes parcels by grid cell so each
lookup is O(cells_spanned + candidates). Parcels no larger than one cell are
bucketed by their bbox min-corner cell in a CSR layout (parcel indices sorted
by cell id, located with ``searchsorted``); the few parcels larger than a
cell are kept in an overflow array tested directly on every query. An exact
bbox test then runs on the gathered subset and the result is sorted, so the
candidate set -- and its order -- is byte-for-byte identical to the old scan.
"""
def __init__(
self,
bboxes: np.ndarray,
offsets: np.ndarray,
coords_mmap: np.memmap,
cell_size: float = _GRID_CELL_SIZE,
) -> None:
self._bboxes = bboxes
self._offsets = offsets
self._coords = coords_mmap
self._cell_size = cell_size
self._origin_x = float(bboxes[:, 0].min())
self._origin_y = float(bboxes[:, 1].min())
# Flattened cell id is ``cx * _ny + cy``; +2 leaves a guard row so the
# query's one-cell low-edge widening can never collide with cx-1.
self._ny = int((bboxes[:, 1].max() - self._origin_y) // cell_size) + 2
width = bboxes[:, 2] - bboxes[:, 0]
height = bboxes[:, 3] - bboxes[:, 1]
small = np.where((width <= cell_size) & (height <= cell_size))[0]
self._oversized = np.where((width > cell_size) | (height > cell_size))[0]
self._oversized_bb = bboxes[self._oversized]
cx = ((bboxes[small, 0] - self._origin_x) // cell_size).astype(np.int64)
cy = ((bboxes[small, 1] - self._origin_y) // cell_size).astype(np.int64)
cell_id = cx * self._ny + cy
order = np.argsort(cell_id, kind="stable")
self._sorted_cells = cell_id[order]
self._cell_parcels = small[order]
def candidate_indices(self, oa_bounds: tuple[float, float, float, float]) -> np.ndarray:
"""Parcel indices whose bbox overlaps ``oa_bounds`` (ascending order)."""
min_e, min_n, max_e, max_n = oa_bounds
cs = self._cell_size
# A small parcel (<= one cell) overlapping the OA has its min-corner no
# more than one cell below/left of the OA bbox, so widen the low edges by
# a cell. This keeps the lookup free of false negatives.
gx0 = int((min_e - cs - self._origin_x) // cs)
gx1 = int((max_e - self._origin_x) // cs)
gy_lo = int((min_n - cs - self._origin_y) // cs)
gy_hi = int((max_n - self._origin_y) // cs)
parts = []
ob = self._oversized_bb
if len(ob):
mo = (
(ob[:, 2] >= min_e)
& (ob[:, 0] <= max_e)
& (ob[:, 3] >= min_n)
& (ob[:, 1] <= max_n)
)
if mo.any():
parts.append(self._oversized[mo])
for gx in range(gx0, gx1 + 1):
base = gx * self._ny
lo = np.searchsorted(self._sorted_cells, base + gy_lo, "left")
hi = np.searchsorted(self._sorted_cells, base + gy_hi, "right")
if hi > lo:
parts.append(self._cell_parcels[lo:hi])
if not parts:
return np.empty(0, dtype=np.int64)
cand = np.concatenate(parts)
cb = self._bboxes[cand]
mask = (
(cb[:, 2] >= min_e)
& (cb[:, 0] <= max_e)
& (cb[:, 3] >= min_n)
& (cb[:, 1] <= max_n)
)
# Sort so the candidate order matches the old full np.where scan exactly.
return np.sort(cand[mask])
def candidates(
    self, oa_bounds: tuple[float, float, float, float]
) -> list[Polygon]:
    """Return the INSPIRE polygons whose bbox overlaps an OA.

    Shapely objects are created only for the rows reported by
    ``candidate_indices`` (typically 10-500 per OA); coordinates are sliced
    out of the flat coordinate buffer on demand.

    Invalid rings are repaired with ``make_valid``. If the repair yields a
    MultiPolygon the largest part is kept; any other non-Polygon repair
    result is discarded, as are empty polygons.
    """
    matches = []
    for row in self.candidate_indices(oa_bounds):
        # Column 0 is a byte offset into the buffer; float64 = 8 bytes.
        start = self._offsets[row, 0] // 8
        point_count = self._offsets[row, 1]
        ring = self._coords[start : start + point_count * 2].reshape(-1, 2)
        geom = Polygon(ring)
        if not geom.is_valid:
            geom = make_valid(geom)
            kind = geom.geom_type
            if kind == "MultiPolygon":
                geom = max(geom.geoms, key=lambda part: part.area)
            elif kind != "Polygon":
                continue
        if geom.is_empty:
            continue
        matches.append(geom)
    return matches
def build_inspire_index(
    bboxes: np.ndarray,
    offsets: np.ndarray,
    coords_mmap: np.memmap,
    cell_size: float = _GRID_CELL_SIZE,
) -> InspireIndex:
    """Build the grid spatial index used for per-OA candidate retrieval.

    Thin factory around ``InspireIndex``: all arguments are forwarded
    unchanged. Per-OA lookups then go through
    ``InspireIndex.candidate_indices`` / ``InspireIndex.candidates`` instead
    of a linear scan over every parcel bbox.

    Args:
        bboxes: Parcel bounding boxes, one row per parcel as
            ``(min_e, min_n, max_e, max_n)``.
        offsets: Per-parcel ``(byte_offset, n_points)`` rows locating each
            ring in ``coords_mmap``.
        coords_mmap: Flat float64 coordinate buffer the offsets point into.
        cell_size: Grid cell edge length, in the same units as ``bboxes``.

    Returns:
        The constructed ``InspireIndex``.
    """
    return InspireIndex(bboxes, offsets, coords_mmap, cell_size)

View file

@ -3,8 +3,9 @@ import shutil
from collections import defaultdict
from pathlib import Path
import numpy as np
from pyproj import Transformer
from shapely import make_valid, set_precision
from shapely import STRtree, make_valid, set_precision
from shapely.errors import GEOSException
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
from shapely.ops import transform as transform_geometry
@ -41,30 +42,30 @@ def _largest_polygonal(geom) -> Polygon | None:
return None
def to_wgs84_geojson(
geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
# Output coordinate grid (~0.11 m at UK latitudes). Polygons whose extent is
# below this in any direction snap to empty during serialization.
_OUTPUT_PRECISION_DEG = 0.000001
# Minimal BNG buffer used to rescue sub-grid slivers into a representable
# footprint. A near-zero-area Voronoi/INSPIRE spike (e.g. three almost-collinear
# vertices) would otherwise vanish at output precision; since every *active*
# postcode must keep a boundary (validate_outputs enforces this with zero
# tolerance), we fatten it just enough to survive snapping rather than drop it.
_MIN_FOOTPRINT_BUFFER_M = 0.5
def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
"""Transform a BNG polygon to WGS84, snap to output precision, validate.
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
just the intermediate Shapely object: coordinate snapping during
serialization can otherwise leave a self-intersecting ring that only shows up
once the feature is read back from disk. Any such geometry is repaired with
``make_valid`` before returning so written features are always valid.
once the feature is read back from disk. Returns ``None`` if the geometry
collapses to empty (a sub-grid sliver).
"""
geom = _largest_polygonal(geom)
if geom is None:
return None
simplified = geom.simplify(tolerance, preserve_topology=True)
simplified = _largest_polygonal(simplified)
if simplified is None:
return None
transformer = _get_to_wgs84()
wgs84 = transform_geometry(transformer.transform, simplified)
wgs84 = transform_geometry(transformer.transform, geom_bng)
try:
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
wgs84 = set_precision(wgs84, _OUTPUT_PRECISION_DEG, mode="valid_output")
except GEOSException:
# Precision snapping can fail on pathological geometries; fall back to a
# plain validity repair without coordinate snapping.
@ -87,20 +88,105 @@ def to_wgs84_geojson(
return geojson_dict
def _rescue_footprint(geom_bng) -> dict | None:
    """Buffer a degenerate BNG geometry into a minimal footprint and snap it.

    The geometry is buffered by ``_MIN_FOOTPRINT_BUFFER_M``, reduced with
    ``_largest_polygonal`` and passed to ``_snap_to_wgs84_geojson``. Returns
    ``None`` when no polygon survives the buffering, otherwise whatever the
    snap step returns.
    """
    fattened = geom_bng.buffer(_MIN_FOOTPRINT_BUFFER_M)
    footprint = _largest_polygonal(fattened)
    return None if footprint is None else _snap_to_wgs84_geojson(footprint)
def to_wgs84_geojson(
    geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
    """Simplify a BNG geometry, convert it to WGS84 and return a GeoJSON dict.

    Conversion is attempted in stages, stopping at the first that yields a
    result:

    1. Simplify the cleaned polygon (falling back to the unsimplified one if
       simplification leaves nothing polygonal) and snap it.
    2. If snapping erased a thin sliver, buffer that same shape into a
       minimal footprint and snap again.
    3. For input that could not be cleaned or fattened in place, buffer
       ``representative_point()`` into a small disc.

    Slivers are rescued rather than dropped because an active postcode left
    without a boundary is rejected by ``validate_outputs``. ``None`` is
    returned up front only for ``None`` or empty input.
    """
    if geom is None or geom.is_empty:
        return None

    polygon = _largest_polygonal(geom)
    if polygon is not None:
        reduced = _largest_polygonal(
            polygon.simplify(tolerance, preserve_topology=True)
        )
        target = polygon if reduced is None else reduced
        # Normal snap first; if it erases a thin sliver, fatten its real shape.
        for convert in (_snap_to_wgs84_geojson, _rescue_footprint):
            converted = convert(target)
            if converted is not None:
                return converted

    # Universal fallback for input too degenerate to clean or fatten in place.
    return _rescue_footprint(geom.representative_point())
def to_wgs84_geojson_multi(
    geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
    """Convert a (possibly multi-part) postcode geometry to a GeoJSON dict,
    preserving every part.

    Each part is converted independently via :func:`to_wgs84_geojson`. The
    result is that part's dict when only one part survives, or a
    ``MultiPolygon`` dict when several do.

    Args:
        geom: Polygon or MultiPolygon to convert. ``None`` and empty
            geometries are accepted and produce ``None``, matching
            :func:`to_wgs84_geojson`.
        tolerance: Simplification tolerance forwarded to
            :func:`to_wgs84_geojson`.

    Returns:
        A GeoJSON geometry dict, or ``None`` if no part yields a geometry.
    """
    if geom is None or geom.is_empty:
        return None
    parts = list(geom.geoms) if geom.geom_type == "MultiPolygon" else [geom]
    part_dicts = [d for part in parts if (d := to_wgs84_geojson(part, tolerance))]
    if not part_dicts:
        return None
    if len(part_dicts) == 1:
        return part_dicts[0]

    polygons = []
    for part_dict in part_dicts:
        if part_dict["type"] == "MultiPolygon":
            # Defensive: a part that itself serialised as a MultiPolygon is
            # flattened so the nesting depth stays that of a valid
            # MultiPolygon coordinate array.
            polygons.extend(part_dict["coordinates"])
        else:
            polygons.append(part_dict["coordinates"])
    return {"type": "MultiPolygon", "coordinates": polygons}
# Interior holes from the INSPIRE+Voronoi+make_valid chain are small artifacts and
# get filled. A hole at least this large is likely a genuinely enclosed postcode
# (kept, so we never solidify over a neighbour); the de-overlap pass is the real
# guarantee, this is defence-in-depth.
_MAX_ARTIFACT_HOLE_AREA = 1000.0
def _fill_small_holes(poly: Polygon) -> Polygon:
    """Rebuild ``poly`` keeping only interior rings of at least
    ``_MAX_ARTIFACT_HOLE_AREA``; smaller holes are filled in."""
    large_holes = []
    for ring in poly.interiors:
        if Polygon(ring).area >= _MAX_ARTIFACT_HOLE_AREA:
            large_holes.append(ring)
    return Polygon(poly.exterior, large_holes)
def _fill_holes(geom):
    """Fill small artifact interior rings; keep large (real-enclosed) holes.

    Polygons are passed through ``_fill_small_holes``; each member of a
    MultiPolygon is treated the same way. Any other geometry type is returned
    unchanged.
    """
    if geom.geom_type == "Polygon":
        return _fill_small_holes(geom)
    if geom.geom_type == "MultiPolygon":
        return MultiPolygon([_fill_small_holes(p) for p in geom.geoms])
    return geom
def _largest_polygon(geom):
    """Return the largest-area member of a MultiPolygon.

    Any geometry that is not a MultiPolygon is returned unchanged.
    """
    if geom.geom_type != "MultiPolygon":
        return geom
    return max(geom.geoms, key=lambda part: part.area)
# A postcode genuinely split across an OA seam (by a railway, river, or main road
# wider than the merge buffer) arrives here as a MultiPolygon. Keeping only the
# largest part used to discard the rest, leaving ~1.8% of merged area as uncovered
# gaps (often 3000-5000 m² building blocks). Keep every part at least this big;
# smaller detached bits are Voronoi/clipping noise and are still dropped.
_MIN_DETACHED_PART_AREA = 100.0


def _keep_polygon_parts(geom):
    """Keep all MultiPolygon parts >= _MIN_DETACHED_PART_AREA (largest if none).

    Non-MultiPolygon geometries are returned unchanged, and so is an empty
    MultiPolygon: it has no parts to choose from, and taking ``max()`` over
    nothing would raise ``ValueError``.

    Returns:
        The single surviving part as-is, or a new ``MultiPolygon`` when more
        than one part meets the threshold.
    """
    if geom.geom_type != "MultiPolygon" or geom.is_empty:
        return geom
    members = list(geom.geoms)
    parts = [g for g in members if g.area >= _MIN_DETACHED_PART_AREA]
    if not parts:
        # Nothing is substantial: fall back to the largest part so the
        # postcode still keeps a geometry.
        parts = [max(members, key=lambda g: g.area)]
    return parts[0] if len(parts) == 1 else MultiPolygon(parts)
def merge_fragments(
@ -126,14 +212,19 @@ def merge_fragments(
continue
if not combined.is_valid:
combined = make_valid(combined)
# Close tiny gaps between adjacent OA boundary edges (float mismatches)
# Close tiny gaps between adjacent OA boundary edges (float mismatches).
# The closing can erode a tiny MultiPolygon (e.g. a postcode with only a
# sliver fragment) to nothing, which would leave the postcode with no
# geometry at all — keep the un-closed shape if that happens.
if combined.geom_type == "MultiPolygon":
combined = combined.buffer(5.0).buffer(-5.0)
if not combined.is_valid:
combined = make_valid(combined)
# Postcodes are contiguous delivery routes — keep only the largest
# polygon; small detached fragments are algorithm artifacts
combined = _largest_polygon(combined)
closed = combined.buffer(5.0).buffer(-5.0)
if not closed.is_valid:
closed = make_valid(closed)
if not closed.is_empty:
combined = closed
# Keep the postcode whole: the largest part plus any other substantial
# part (a genuine railway/river split), dropping only tiny noise slivers.
combined = _keep_polygon_parts(combined)
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
combined = _fill_holes(combined)
# Subtract parks/water if provided
@ -142,7 +233,7 @@ def merge_fragments(
pre_green = combined
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
combined = _largest_polygon(combined)
combined = _keep_polygon_parts(combined)
# Do NOT _fill_holes here: interior holes carved by the greenspace
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
# Filling them would re-add the removed area and negate the
@ -155,10 +246,114 @@ def merge_fragments(
return merged
def _polygonal(geom):
    """Reduce a geometry to its polygonal content.

    Polygons and MultiPolygons are returned unchanged. For a
    GeometryCollection the non-empty Polygon/MultiPolygon members are merged
    with ``unary_union``. ``None`` is returned for ``None`` or empty input,
    for any other geometry type, and when nothing polygonal remains.
    """
    if geom is None or geom.is_empty:
        return None

    kind = geom.geom_type
    if kind in ("Polygon", "MultiPolygon"):
        return geom
    if kind != "GeometryCollection":
        return None

    members = []
    for member in geom.geoms:
        if member.geom_type in ("Polygon", "MultiPolygon") and not member.is_empty:
            members.append(member)
    if not members:
        return None

    union = unary_union(members)
    return None if union.is_empty else union
def _resolve_overlaps(
    items: list[tuple[str, Polygon | MultiPolygon]],
) -> list[tuple[str, Polygon | MultiPolygon]]:
    """Make the postcode polygons a partition: no two cover the same ground.

    Overlap appears at OA seams (the 5m merge buffer expands each postcode
    independently), from simplifying each postcode on its own, and as genuine
    containment (a postcode fully enclosed by another). Each postcode is trimmed
    by the union of its higher-priority overlapping neighbours, where **priority =
    ascending area**: a smaller postcode wins contested ground. That single rule
    handles both cases correctly: an enclosed postcode is always smaller than its
    container, so it keeps its area while the container gets a hole (an `overlaps`
    query alone would miss containment entirely). Run last, on the final output
    geometries, so nothing re-introduces overlap afterwards. A postcode that would
    be emptied keeps its original geometry, so an active postcode is never dropped.

    Args:
        items: ``(postcode, geometry)`` pairs.

    Returns:
        A list with the same postcodes in the same order, each paired with its
        trimmed geometry. With fewer than two items the input list itself is
        returned.
    """
    geoms = [g for _, g in items]
    n = len(geoms)
    if n < 2:
        return items
    # rank[i]: 0 = highest priority (smallest area). Postcode string breaks ties
    # for determinism.
    rank = {
        idx: r
        for r, idx in enumerate(
            sorted(range(n), key=lambda i: (geoms[i].area, items[i][0]))
        )
    }
    tree = STRtree(geoms)
    arr = np.array(geoms, dtype=object)
    # Unordered index pairs (low, high) of geometries that share 2-D area.
    pairs: set[tuple[int, int]] = set()
    # "overlaps" gives partial overlaps; "contains" gives containment (which
    # "overlaps" excludes) — together they cover every 2-D overlap without the
    # edge-touch explosion a plain "intersects" query would add.
    for predicate in ("overlaps", "contains"):
        qsrc, qtgt = tree.query(arr, predicate=predicate)
        for s, t in zip(qsrc.tolist(), qtgt.tolist()):
            # Skip self-matches and store each pair once, whichever side hit.
            if s != t:
                pairs.add((s, t) if s < t else (t, s))
    # For each loser (lower priority) the higher-priority neighbours to subtract.
    higher: dict[int, list[int]] = defaultdict(list)
    for a, b in pairs:
        winner, loser = (a, b) if rank[a] < rank[b] else (b, a)
        higher[loser].append(winner)
    out = list(geoms)
    # Process losers from highest priority down, so every subtracted neighbour is
    # already finalised.
    for i in sorted(higher, key=lambda idx: rank[idx]):
        cut = unary_union([out[j] for j in higher[i]])
        trimmed = out[i].difference(cut)
        if not trimmed.is_valid:
            trimmed = make_valid(trimmed)
        # Keep all polygonal parts: these geometries are in WGS84 degrees, so an
        # area threshold here would wrongly drop everything but the largest part
        # and re-open the very gaps the seam fix closed.
        trimmed = _polygonal(trimmed)
        # Only replace when something is left; otherwise the postcode keeps its
        # untrimmed geometry.
        if trimmed is not None and not trimmed.is_empty:
            out[i] = trimmed
    return [(pc, out[i]) for i, (pc, _) in enumerate(items)]
def _round_coords(coords, ndigits=6):
    """Recursively round a GeoJSON coordinate structure to ``ndigits`` places.

    A non-empty sequence whose first element is a number is treated as a
    position and reduced to its rounded first two ordinates. Anything else is
    treated as a container and rounded element by element. Always returns
    lists.
    """
    is_position = bool(coords) and isinstance(coords[0], (int, float))
    if not is_position:
        return [_round_coords(child, ndigits) for child in coords]
    first, second = coords[0], coords[1]
    return [round(first, ndigits), round(second, ndigits)]
def _geojson_geometry(geom) -> dict | None:
    """Serialize a WGS84 polygon/multipolygon to a 6dp GeoJSON dict, or None.

    An invalid geometry is first repaired with ``make_valid``; only the
    polygonal content is kept. ``None`` is returned when nothing polygonal
    remains.
    """
    repaired = geom if geom.is_valid else make_valid(geom)
    polygonal = _polygonal(repaired)
    if polygonal is None or polygonal.is_empty:
        return None
    gj = mapping(polygonal)
    rounded = _round_coords(gj["coordinates"])
    return {"type": gj["type"], "coordinates": rounded}
def write_district_geojson(
postcodes: dict[str, Polygon | MultiPolygon], output_dir: Path
) -> int:
"""Group postcodes by district, write GeoJSON files. Returns file count."""
"""Group postcodes by district, write GeoJSON files. Returns file count.
Before writing, the postcode polygons are converted to their final WGS84 form
and made a partition (overlaps removed) so the output never has two postcodes
covering the same ground.
"""
units_dir = output_dir / "units"
tmp_units_dir = output_dir / "units.tmp"
output_dir.mkdir(parents=True, exist_ok=True)
@ -166,38 +361,46 @@ def write_district_geojson(
shutil.rmtree(tmp_units_dir)
tmp_units_dir.mkdir(parents=True)
skipped: list[str] = []
# Pass 1: convert every postcode to its final WGS84 geometry (simplify, snap,
# sliver-rescue, multi-part preserved). Sorted → deterministic de-overlap
# priority. to_wgs84_geojson_multi returns None only for a genuinely empty
# input, which is skipped and reported rather than aborting a multi-hour run.
converted: list[tuple[str, Polygon | MultiPolygon]] = []
for pc in sorted(postcodes):
gj = to_wgs84_geojson_multi(postcodes[pc])
if gj is None:
skipped.append(pc)
continue
converted.append((pc, shape(gj)))
# Remove overlap strips so the output is a clean partition.
converted = _resolve_overlaps(converted)
by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
for pc, geom in postcodes.items():
for pc, geom in converted:
parts = pc.split()
district = parts[0] if parts else pc[:4]
by_district[district].append((pc, geom))
file_count = 0
seen_postcodes: set[str] = set()
for district, entries in tqdm(
sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
):
features = []
for pc, geom in sorted(entries, key=lambda x: x[0]):
if pc in seen_postcodes:
raise ValueError(f"Duplicate postcode boundary feature: {pc}")
seen_postcodes.add(pc)
geojson_geom = to_wgs84_geojson(geom)
geojson_geom = _geojson_geometry(geom)
if geojson_geom is None:
raise ValueError(f"Postcode boundary collapsed to empty geometry: {pc}")
written_geom = shape(geojson_geom)
if written_geom.is_empty or not written_geom.is_valid:
raise ValueError(
f"Invalid postcode boundary geometry after output: {pc}"
)
mapit_code = pc.replace(" ", "")
skipped.append(pc)
continue
features.append(
{
"type": "Feature",
"geometry": geojson_geom,
"properties": {
"postcodes": pc,
"mapit_code": mapit_code,
"mapit_code": pc.replace(" ", ""),
},
}
)
@ -211,6 +414,14 @@ def write_district_geojson(
json.dump(collection, f, separators=(",", ":"))
file_count += 1
if skipped:
    # Report skipped postcodes without flooding the log: list at most ten and
    # mark the list as truncated when there are more. The marker is ASCII so
    # it survives any console encoding.
    preview = ", ".join(skipped[:10])
    suffix = ", ..." if len(skipped) > 10 else ""
    print(
        f" Skipped {len(skipped)} postcode(s) with degenerate (sub-grid) "
        f"geometry: {preview}{suffix}"
    )
if units_dir.exists():
shutil.rmtree(units_dir)
tmp_units_dir.replace(units_dir)

View file

@ -85,19 +85,42 @@ def _claim_inspire_parcels(
uprn_pts = shp_points(points)
pt_idx, cand_idx = cand_tree.query(uprn_pts, predicate="within")
# First priority: parcels that physically contain UPRNs. Majority vote
# resolves blocks of flats or overlapping parcel data.
# First priority: parcels that physically contain UPRNs. A parcel holding
# UPRNs from a single postcode goes wholly to that postcode. A parcel shared
# by several postcodes (a block of flats spanning postcodes, or overlapping
# parcel data) is split between them via a sub-Voronoi over their own UPRNs
# clipped to the parcel — so EVERY contained postcode keeps part of the
# parcel. A bare majority vote would hand the whole parcel to one winner and
# leave the losers' UPRNs trapped inside claimed land, dropping them from
# both this claim and the `remaining` polygon handed to Voronoi downstream.
cand_postcodes: dict[int, list[str]] = defaultdict(list)
cand_point_idx: dict[int, list[int]] = defaultdict(list)
for pi, ci in zip(pt_idx, cand_idx):
cand_postcodes[ci].append(postcodes[pi])
cand_point_idx[ci].append(pi)
points_f64 = points.astype(np.float64, copy=False)
contained_parts: dict[str, list] = defaultdict(list)
contained_scores: Counter[str] = Counter()
for ci, pc_list in cand_postcodes.items():
pc_counts = Counter(pc_list)
winner, votes = pc_counts.most_common(1)[0]
contained_parts[winner].append(parcels[ci])
contained_scores[winner] += votes
if len(pc_counts) == 1:
winner = next(iter(pc_counts))
contained_parts[winner].append(parcels[ci])
contained_scores[winner] += pc_counts[winner]
continue
# Shared parcel: sub-Voronoi over the contained UPRNs so each postcode
# present keeps a fragment instead of being absorbed by the winner.
sub_idx = cand_point_idx[ci]
sub_points = points_f64[sub_idx]
sub_postcodes = [postcodes[pi] for pi in sub_idx]
for pc, geom in compute_voronoi_regions(
sub_points, sub_postcodes, parcels[ci]
).items():
cleaned = _clean_polygonal(geom)
if cleaned is not None:
contained_parts[pc].append(cleaned)
contained_scores[pc] += pc_counts[pc]
contained_claimed = _merge_parts_by_postcode(contained_parts)
contained_claims = sorted(
@ -109,7 +132,6 @@ def _claim_inspire_parcels(
# each to the nearest UPRN/postcode so parcel boundaries carry more of the
# visible postcode shape; Voronoi is then limited to roads, parks, water, and
# any other non-parcel gaps.
points_f64 = points.astype(np.float64, copy=False)
contained_union = _union_claims(contained_claims)
nearest_tree = cKDTree(points_f64)
nearest_parts: dict[str, list] = defaultdict(list)
@ -235,11 +257,11 @@ def _extract_polygonal(geom) -> Polygon | MultiPolygon | None:
return None
if len(polys) == 1:
return polys[0]
return MultiPolygon(
[
p
for g in polys
for p in (g.geoms if g.geom_type == "MultiPolygon" else [g])
]
)
# Union (not bare MultiPolygon construction): make_valid can emit
# overlapping polygonal parts, and a MultiPolygon of overlapping parts is
# invalid — it double-counts area and makes the next `.difference()` raise
# a TopologyException that aborts the OA (and, in parallel mode, the
# worker). unary_union merges them into a valid geometry.
merged = unary_union(polys)
return merged if not merged.is_empty else None
return None

View file

@ -11,12 +11,20 @@ import pytest
from shapely.geometry import MultiPolygon, Polygon, box
from shapely.ops import unary_union
from .fragments_cache import (
fragments_cache_is_fresh,
load_fragments,
save_fragments,
)
from .__main__ import _oa_fragments, _process_oas
from .inspire import build_inspire_index
from .oa_boundaries import parse_gpkg_geometry
from .greenspace import subtract_greenspace
from .output import (
_fill_holes,
merge_fragments,
to_wgs84_geojson,
to_wgs84_geojson_multi,
write_district_geojson,
)
from .process_oa import _extract_polygonal, process_oa
@ -173,6 +181,52 @@ class TestWhitespacePostcodes:
assert loaded_df["PCDS"].to_list() == ["AA1 1AB"]
def test_remapped_terminated_postcode_adopts_successor_oa(self, tmp_path):
    """A terminated postcode remapped to its active successor must be seeded
    under the SUCCESSOR's OA and coordinates, not its own original OA.

    Pre-fix the row kept the terminated postcode's OA21CD, seeding the
    successor into an OA it doesn't belong to and splitting its boundary
    across OAs.
    """
    uprn_path = tmp_path / "uprn.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # The only UPRN belongs to terminated AA1 1AA, which sits in OA E00000001.
    pl.DataFrame(
        {
            "GRIDGB1E": [500010],
            "GRIDGB1N": [180010],
            "PCDS": ["AA1 1AA"],
            "OA21CD": ["E00000001"],
        }
    ).write_parquet(uprn_path)

    # AA1 1AA is terminated, so AA1 1AB is the only active successor, and it
    # lives in a DIFFERENT OA (E00000002).
    pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB"],
            "east1m": [500010, 500030],
            "north1m": [180010, 180020],
            "oa21cd": ["E00000001", "E00000002"],
            "doterm": ["2020-01-01", None],
            "ctry25cd": ["E92000001", "E92000001"],
        }
    ).write_parquet(arcgis_path)

    loaded_df, offsets = load_uprns(uprn_path, arcgis_path)

    # The remapped point is grouped under the successor's OA only.
    assert "E00000002" in offsets, "Successor OA missing — remap kept old OA"
    assert "E00000001" not in offsets, (
        "Remapped point still lives in the terminated postcode's OA"
    )

    points, postcodes = get_oa_uprns(loaded_df, offsets, "E00000002")
    assert postcodes == ["AA1 1AB"]
    # It also adopts the successor's authoritative coordinates.
    assert points.tolist() == [[500030.0, 180020.0]]
def test_arcgis_filters_to_active_english_postcodes(self, tmp_path):
uprns = pl.DataFrame(
{
@ -617,6 +671,32 @@ class TestProcessOAInspireParcelAssignment:
for _, geom in fragments:
assert geom.difference(oa_geom).area < 0.01
def test_shared_parcel_keeps_every_contained_postcode(self):
    """One parcel holding UPRNs for [A, A, B] must give BOTH A and B a fragment.

    Pre-fix the majority winner (A) claimed the whole parcel, excluding it
    from `remaining`, so B's UPRNs were trapped inside claimed land and B
    vanished entirely (no fragment).
    """
    oa_geom = box(0, 0, 100, 100)
    whole_oa_parcel = box(0, 0, 100, 100)  # one parcel covering the whole OA
    uprn_xy = np.array(
        [
            [20, 50],  # postcode A
            [30, 50],  # postcode A (majority)
            [80, 50],  # postcode B (minority — would be dropped pre-fix)
        ]
    )
    uprn_postcodes = ["A", "A", "B"]

    by_postcode = dict(
        process_oa(
            oa_geom,
            uprn_xy,
            uprn_postcodes,
            inspire_candidates=[whole_oa_parcel],
        )
    )

    assert "A" in by_postcode, "Majority postcode A must keep a fragment"
    assert "B" in by_postcode, "Minority postcode B must not be dropped"
    frag_a, frag_b = by_postcode["A"], by_postcode["B"]
    assert frag_a.area > 0
    assert frag_b.area > 0
    # The split must partition the parcel without overlap.
    assert frag_a.intersection(frag_b).area < 0.01
# ---------------------------------------------------------------------------
# _extract_polygonal helper
@ -656,6 +736,21 @@ class TestExtractPolygonal:
assert _extract_polygonal(LineString([(0, 0), (1, 1)])) is None
def test_overlapping_collection_unioned_to_valid(self):
    """OVERLAPPING polygons in a GeometryCollection must come back as one
    VALID geometry whose area does not double-count the overlap.

    A raw MultiPolygon of overlapping parts would be invalid and crash the
    next .difference().
    """
    from shapely.geometry import GeometryCollection

    first = box(0, 0, 100, 100)
    second = box(50, 50, 150, 150)  # overlaps `first` by 50x50
    expected_area = unary_union([first, second]).area

    result = _extract_polygonal(GeometryCollection([first, second]))

    assert result is not None
    assert result.is_valid
    assert result.area == pytest.approx(expected_area)
    # And the formerly-crashing op now works:
    corner = box(0, 0, 10, 10)
    assert result.difference(corner).is_valid
# ---------------------------------------------------------------------------
# Edge case: merge_fragments handles single-OA postcodes
@ -763,12 +858,12 @@ class TestParseGpkgGeometry:
class TestFillHoles:
"""_fill_holes must remove all interior holes from polygons."""
"""_fill_holes fills small artifact holes but keeps large (real-enclosed) ones."""
def test_polygon_with_hole(self):
"""A polygon with an interior ring should become a solid polygon."""
def test_small_artifact_hole_filled(self):
"""A small (<1000 m²) interior ring is an artifact and gets filled."""
outer = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
hole = [(30, 30), (70, 30), (70, 70), (30, 70), (30, 30)]
hole = [(40, 40), (60, 40), (60, 60), (40, 60), (40, 40)] # 20x20 = 400 m²
poly_with_hole = Polygon(outer, [hole])
assert len(list(poly_with_hole.interiors)) == 1
result = _fill_holes(poly_with_hole)
@ -776,6 +871,15 @@ class TestFillHoles:
assert len(list(result.interiors)) == 0
assert result.area == pytest.approx(Polygon(outer).area)
def test_large_hole_kept(self):
    """A large (>=1000 m²) hole is likely a real enclosed postcode — keep it."""
    shell = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
    big_hole = [(20, 20), (80, 20), (80, 80), (20, 80), (20, 20)]  # 60x60 = 3600 m²

    filled = _fill_holes(Polygon(shell, [big_hole]))

    assert len(list(filled.interiors)) == 1
    assert filled.area == pytest.approx(10000 - 3600)
def test_multipolygon_with_holes(self):
"""A MultiPolygon where each part has holes should have all holes removed."""
outer1 = [(0, 0), (50, 0), (50, 50), (0, 50), (0, 0)]
@ -944,3 +1048,356 @@ class TestGreenspaceHolePreserved:
merged = result["TEST1"]
assert len(list(merged.interiors)) == 1
assert merged.area == pytest.approx(10000 - 1600, rel=0.05)
# ---------------------------------------------------------------------------
# merge_fragments keeps substantial detached parts (no OA-seam coverage gaps)
# ---------------------------------------------------------------------------
class TestKeepDetachedParts:
    """A postcode split across an OA seam (railway/river) must keep both parts
    instead of dropping all but the largest, which left ~1.8% uncovered gaps."""

    POSTCODE = "AA1 1AA"

    def _merge(self, *fragments):
        """Merge the fragments as one postcode and return its geometry."""
        tagged = [(self.POSTCODE, fragment) for fragment in fragments]
        return merge_fragments(tagged)[self.POSTCODE]

    def test_far_apart_parts_both_kept(self):
        # Two 50x50m blocks 30m apart — wider than the 10m merge buffer.
        left = box(0, 0, 50, 50)  # 2500 m²
        right = box(80, 0, 130, 50)  # 2500 m², 30m gap
        merged = self._merge(left, right)
        assert merged.geom_type == "MultiPolygon"
        assert len(merged.geoms) == 2
        assert merged.area == pytest.approx(5000, rel=0.01)

    def test_tiny_noise_part_dropped(self):
        block = box(0, 0, 100, 100)  # 10000 m²
        speck = box(200, 200, 205, 205)  # 25 m² < 100 m² threshold
        merged = self._merge(block, speck)
        assert merged.geom_type == "Polygon"
        assert merged.area == pytest.approx(10000, rel=0.01)
class TestMultiPolygonOutput:
    """to_wgs84_geojson_multi / the writer must emit MultiPolygon for split
    postcodes (the Rust server + loader already parse MultiPolygon)."""

    @staticmethod
    def _split_postcode():
        """Two 100x100 m BNG squares 900 m apart, as one MultiPolygon."""
        return MultiPolygon(
            [
                box(530000, 180000, 530100, 180100),
                box(531000, 180000, 531100, 180100),
            ]
        )

    def test_multipolygon_preserves_all_parts(self):
        from shapely.geometry import shape

        geojson = to_wgs84_geojson_multi(self._split_postcode())
        assert geojson["type"] == "MultiPolygon"
        assert len(geojson["coordinates"]) == 2
        # The dict must read back as a valid two-part geometry.
        round_trip = shape(geojson)
        assert round_trip.is_valid and not round_trip.is_empty
        assert len(round_trip.geoms) == 2

    def test_single_part_stays_polygon(self):
        single = box(530000, 180000, 530100, 180100)
        assert to_wgs84_geojson_multi(single)["type"] == "Polygon"

    def test_writer_emits_multipolygon_feature(self, tmp_path):
        written = write_district_geojson({"AA1 1AA": self._split_postcode()}, tmp_path)
        assert written == 1
        district_file = tmp_path / "units" / "AA1.geojson"
        collection = json.loads(district_file.read_text())
        assert collection["features"][0]["geometry"]["type"] == "MultiPolygon"
class TestOutputPartition:
    """The writer must emit a partition: overlapping postcodes are made disjoint
    (no two cover the same ground) without dropping an active postcode."""

    @staticmethod
    def _write_and_read(postcodes, tmp_path):
        """Write the AA1 district file and return {postcode: geometry} read back."""
        from shapely.geometry import shape

        write_district_geojson(postcodes, tmp_path)
        collection = json.loads((tmp_path / "units" / "AA1.geojson").read_text())
        return {
            feature["properties"]["postcodes"]: shape(feature["geometry"])
            for feature in collection["features"]
        }

    def test_overlapping_postcodes_made_disjoint(self, tmp_path):
        west = box(530000, 180000, 530100, 180100)
        east = box(530090, 180000, 530200, 180100)  # overlaps `west` in a 10m strip
        assert west.intersection(east).area > 0  # precondition: they overlap

        written = self._write_and_read({"AA1 1AA": west, "AA1 1AB": east}, tmp_path)

        assert set(written) == {"AA1 1AA", "AA1 1AB"}  # neither dropped
        # Disjoint interiors (share at most an edge).
        shared = written["AA1 1AA"].intersection(written["AA1 1AB"])
        assert shared.area == pytest.approx(0.0, abs=1e-12)
        assert all(geom.area > 0 for geom in written.values())

    def test_enclosed_postcode_makes_container_a_donut(self, tmp_path):
        """A postcode fully INSIDE another must stay disjoint: the smaller (inner)
        keeps its area, the container gets a hole. A plain `overlaps` query misses
        containment, so this is the regression guard for that fix."""
        container = box(530000, 180000, 530300, 180300)  # 90,000 m²
        enclosed = box(530100, 180100, 530200, 180200)  # 10,000 m², fully inside
        assert container.contains(enclosed)  # precondition

        written = self._write_and_read(
            {"AA1 1AA": container, "AA1 1AB": enclosed}, tmp_path
        )

        assert set(written) == {"AA1 1AA", "AA1 1AB"}  # neither dropped
        shared = written["AA1 1AA"].intersection(written["AA1 1AB"])
        assert shared.area == pytest.approx(0.0, abs=1e-12)
        # Container is now a donut around the enclosed postcode.
        assert written["AA1 1AA"].geom_type == "Polygon"
        assert len(list(written["AA1 1AA"].interiors)) == 1
        assert written["AA1 1AB"].area > 0
# ---------------------------------------------------------------------------
# InspireIndex must return the same candidates as a brute-force bbox scan
# ---------------------------------------------------------------------------
class TestInspireIndex:
    """The grid index replaces a per-OA linear scan of all parcel bboxes; it must
    return an identical candidate set (and order) so Phase 3 output is unchanged."""

    @staticmethod
    def _brute(bboxes, query):
        """Reference answer: indices of every bbox overlapping ``query``."""
        q_min_e, q_min_n, q_max_e, q_max_n = query
        overlaps = (
            (bboxes[:, 2] >= q_min_e)
            & (bboxes[:, 0] <= q_max_e)
            & (bboxes[:, 3] >= q_min_n)
            & (bboxes[:, 1] <= q_max_n)
        )
        return np.where(overlaps)[0]

    def test_matches_brute_force_over_random_queries(self):
        rng = np.random.default_rng(0)
        x = rng.uniform(0, 10000, 5000)
        y = rng.uniform(0, 10000, 5000)
        w = rng.uniform(1, 60, 5000)  # all <= 500m cell → CSR path
        h = rng.uniform(1, 60, 5000)
        bboxes = np.column_stack([x, y, x + w, y + h]).astype(np.float64)
        index = build_inspire_index(bboxes, None, None, cell_size=500.0)
        for _ in range(400):
            cx, cy = rng.uniform(0, 10000), rng.uniform(0, 10000)
            side = float(rng.choice([30.0, 200.0, 1000.0, 3000.0]))
            query = (cx, cy, cx + side, cy + side)
            found = index.candidate_indices(query)
            assert np.array_equal(found, np.sort(self._brute(bboxes, query)))

    def test_oversized_parcel_is_found(self):
        # A parcel larger than a cell goes to the overflow list, not the grid;
        # a query deep inside it (away from the small parcels) must still find it.
        bboxes = np.array(
            [
                [0.0, 0.0, 5000.0, 5000.0],  # 5km parcel >> 500m cell
                [100.0, 100.0, 120.0, 120.0],
                [4000.0, 4000.0, 4020.0, 4020.0],
            ]
        )
        index = build_inspire_index(bboxes, None, None, cell_size=500.0)
        query = (2000.0, 2000.0, 2050.0, 2050.0)
        found = index.candidate_indices(query)
        assert 0 in found
        assert np.array_equal(found, np.sort(self._brute(bboxes, query)))

    def test_no_overlap_returns_empty(self):
        bboxes = np.array([[0.0, 0.0, 10.0, 10.0], [20.0, 20.0, 30.0, 30.0]])
        index = build_inspire_index(bboxes, None, None, cell_size=500.0)
        far_away = (100.0, 100.0, 110.0, 110.0)
        assert len(index.candidate_indices(far_away)) == 0
# ---------------------------------------------------------------------------
# Parallel OA processing must match the sequential result exactly
# ---------------------------------------------------------------------------
class TestParallelProcessing:
    """Fragments from a multi-worker _process_oas run must equal the workers=1 run.

    Every OA here carries a single postcode (the fast path), so the test covers
    chunking, the WKB round-trip and the fork machinery without INSPIRE data."""

    @staticmethod
    def _inputs(n_oas=60):
        """Build the positional inputs for _process_oas: n_oas OAs, two UPRNs each."""
        import pyarrow as pa

        # One 50m square per OA, spaced 100m apart along the easting axis.
        oa_geoms = {}
        for i in range(n_oas):
            left = i * 100.0
            oa_geoms[f"E{i:08d}"] = box(left, 0.0, left + 50.0, 50.0)
        codes = sorted(oa_geoms)

        eastings, northings, postcodes, offsets = [], [], [], {}
        for i, code in enumerate(codes):
            start = 2 * i
            eastings.extend((i * 100.0 + 10.0, i * 100.0 + 20.0))
            northings.extend((10.0, 20.0))
            # one postcode per OA → fast path
            postcodes.extend([f"AA{i % 5} {i % 9}AA"] * 2)
            # Half-open [start, end) slice of the flat arrays owned by this OA.
            offsets[code] = (start, start + 2)

        return (
            codes,
            oa_geoms,
            np.array(eastings),
            np.array(northings),
            pa.array(postcodes, type=pa.large_string()),
            offsets,
        )

    @staticmethod
    def _norm(frags):
        """Order-independent form of a fragment list: sorted (postcode, WKB hex)."""
        pairs = [(postcode, geometry.wkb_hex) for postcode, geometry in frags]
        pairs.sort()
        return pairs

    def test_parallel_matches_sequential(self):
        codes, oa, east, north, pcs, offs = self._inputs()
        expected_count = len(codes)

        seq, s1 = _process_oas(codes, oa, east, north, pcs, offs, None, workers=1)
        par, s2 = _process_oas(codes, oa, east, north, pcs, offs, None, workers=3)

        # one fragment per single-postcode OA
        assert len(seq) == expected_count
        assert s1 == s2 == expected_count
        assert self._norm(seq) == self._norm(par)

    def test_oa_failure_is_tagged_with_oa_code(self):
        """An error raised while processing one OA must surface with that OA's
        code in the message, so the bad OA is identifiable rather than an
        anonymous worker abort."""
        missing_code = "E00099999"
        # The geoms dict is empty, so the lookup fails with KeyError; the
        # expectation is a RuntimeError that names the OA.
        with pytest.raises(RuntimeError, match="E00099999"):
            _oa_fragments(missing_code, {}, None, None, None, {}, None)
class TestDegenerateGeometryHandling:
    """validate_outputs is strict: every active postcode must keep a boundary.
    A sub-grid sliver is therefore fattened rather than dropped, while a truly
    empty geometry is skipped without aborting the whole write (the 10h
    regression)."""

    # Three near-collinear vertices in BNG: bbox ~28m x 7m but area ~0.04 m²,
    # i.e. AL10 0TU. Without the rescue it snaps to empty at output precision.
    SLIVER = Polygon(
        [(523045.34, 209625.56), (523040.47, 209624.33), (523017.0, 209618.42)]
    )

    @staticmethod
    def _written_postcodes(out_dir):
        """Return the set of `postcodes` properties in the written AA1 district file."""
        payload = (out_dir / "units" / "AA1.geojson").read_text()
        features = json.loads(payload)["features"]
        return {feature["properties"]["postcodes"] for feature in features}

    def test_sliver_is_rescued_to_valid_geometry(self):
        from shapely.geometry import shape

        geojson = to_wgs84_geojson(self.SLIVER)
        assert geojson is not None, "sliver must be rescued, not dropped"

        round_tripped = shape(geojson)
        assert not round_tripped.is_empty
        assert round_tripped.is_valid

    def test_collinear_zero_area_input_is_rescued(self):
        """A collinear, zero-area 'polygon' cannot be cleaned into a polygon, so
        it must be rescued via the representative-point fallback, not dropped."""
        from shapely.geometry import shape

        flat_ring = [
            (523000, 209600),
            (523010, 209600),
            (523020, 209600),
            (523000, 209600),
        ]
        degenerate = Polygon(flat_ring)
        assert degenerate.area == 0.0

        geojson = to_wgs84_geojson(degenerate)
        assert geojson is not None, "degenerate input must be rescued, not dropped"

        round_tripped = shape(geojson)
        assert not round_tripped.is_empty
        assert round_tripped.is_valid

    def test_sliver_postcode_present_in_output(self, tmp_path):
        regular = box(530000, 180000, 530100, 180100)
        # The sliver postcode must survive the write.
        postcodes = {"AA1 1AA": regular, "AA1 1AB": self.SLIVER}

        file_count = write_district_geojson(postcodes, tmp_path)

        assert file_count == 1
        assert self._written_postcodes(tmp_path) == {"AA1 1AA", "AA1 1AB"}

    def test_empty_geometry_skipped_not_raised(self, tmp_path):
        # The last-resort safety net: an unrescuable (empty) geometry is skipped
        # so one bad postcode can never abort a multi-hour run.
        regular = box(530000, 180000, 530100, 180100)
        genuinely_empty = Polygon()
        postcodes = {"AA1 1AA": regular, "AA1 1AB": genuinely_empty}

        file_count = write_district_geojson(postcodes, tmp_path)

        assert file_count == 1
        assert self._written_postcodes(tmp_path) == {"AA1 1AA"}
# ---------------------------------------------------------------------------
# fragments_cache round-trips Phase 3 output and validates freshness
# ---------------------------------------------------------------------------
class TestFragmentsCache:
    """The Phase 3 cache lets a crashed run resume without the ~10h OA loop."""

    def test_round_trip_preserves_postcodes_and_geometry(self, tmp_path):
        two_part = MultiPolygon([box(0, 0, 10, 10), box(20, 20, 30, 30)])
        fragments = [
            ("AA1 1AA", box(0, 0, 100, 100)),
            ("AA1 1AB", box(200, 200, 250, 260)),
            # A postcode spanning multiple OAs appears as repeated entries.
            ("AA1 1AA", box(100, 0, 150, 100)),
            ("AA1 1AC", two_part),
        ]
        cache = tmp_path / "fragments_cache.parquet"

        save_fragments(cache, fragments)
        loaded = load_fragments(cache)

        # Postcode order (including the repeated entry) must be preserved.
        saved_postcodes = [postcode for postcode, _ in fragments]
        loaded_postcodes = [postcode for postcode, _ in loaded]
        assert loaded_postcodes == saved_postcodes
        for saved, restored in zip(fragments, loaded):
            assert restored[1].equals(saved[1])

    def test_save_is_atomic_no_tmp_left_behind(self, tmp_path):
        cache = tmp_path / "fragments_cache.parquet"
        leftover = tmp_path / "fragments_cache.parquet.tmp"

        save_fragments(cache, [("AA1 1AA", box(0, 0, 1, 1))])

        assert cache.exists()
        assert not leftover.exists()

    def test_missing_cache_is_not_fresh(self, tmp_path):
        # The input exists but the cache file was never written.
        source = tmp_path / "uprn.parquet"
        source.write_text("x")
        cache = tmp_path / "fragments_cache.parquet"

        assert fragments_cache_is_fresh(cache, [source]) is False

    def test_cache_newer_than_inputs_is_fresh(self, tmp_path):
        import os

        source = tmp_path / "uprn.parquet"
        cache = tmp_path / "fragments_cache.parquet"
        source.write_text("x")
        cache.write_text("c")
        # Pin the mtimes so the cache is strictly newer than its input.
        os.utime(source, (1_000, 1_000))
        os.utime(cache, (2_000, 2_000))

        # A None entry in the input list is accepted alongside a real path.
        assert fragments_cache_is_fresh(cache, [source, None]) is True

    def test_cache_older_than_any_input_is_stale(self, tmp_path):
        import os

        source = tmp_path / "oa.gpkg"
        cache = tmp_path / "fragments_cache.parquet"
        source.write_text("x")
        cache.write_text("c")
        os.utime(cache, (1_000, 1_000))
        os.utime(source, (2_000, 2_000))  # input touched after the cache

        assert fragments_cache_is_fresh(cache, [source]) is False

    def test_missing_input_is_ignored(self, tmp_path):
        cache = tmp_path / "fragments_cache.parquet"
        cache.write_text("c")
        # arcgis is optional/absent — it cannot have invalidated the cache.
        absent_input = tmp_path / "absent.parquet"

        assert fragments_cache_is_fresh(cache, [absent_input]) is True

View file

@ -79,13 +79,42 @@ def load_uprns(
)
if mapping is not None and mapping.height > 0:
uprns = (
uprns.join(
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
# Remap terminated postcodes to their nearest active successor. The
# successor generally lives in a DIFFERENT OA (and at different grid
# coordinates), so the remapped point must adopt the successor's
# authoritative OA/coords — keeping the terminated postcode's original
# OA would seed the successor into an OA it doesn't belong to, splitting
# its boundary across OAs. Genuine (non-remapped) UPRN rows keep their
# own OA, since a live postcode can legitimately span several OAs.
uprns = uprns.join(
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
).with_columns(pl.col("new_postcode").is_not_null().alias("_remapped"))
if active_postcode_points is not None:
successor_oa = active_postcode_points.rename(
{
"PCDS": "new_postcode",
"GRIDGB1E": "_succ_e",
"GRIDGB1N": "_succ_n",
"OA21CD": "_succ_oa",
}
)
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
)
uprns = uprns.join(successor_oa, on="new_postcode", how="left").with_columns(
pl.when("_remapped")
.then(pl.col("_succ_e"))
.otherwise(pl.col("GRIDGB1E"))
.alias("GRIDGB1E"),
pl.when("_remapped")
.then(pl.col("_succ_n"))
.otherwise(pl.col("GRIDGB1N"))
.alias("GRIDGB1N"),
pl.when("_remapped")
.then(pl.col("_succ_oa"))
.otherwise(pl.col("OA21CD"))
.alias("OA21CD"),
)
uprns = uprns.with_columns(
pl.coalesce("new_postcode", "PCDS").alias("PCDS")
).select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
if active_postcode_points is not None:
active_postcodes = active_postcode_points.select("PCDS").unique()
@ -149,3 +178,37 @@ def get_oa_uprns(
)
postcodes = sub["PCDS"].to_list()
return points, postcodes
def extract_uprn_arrays(df: pl.DataFrame):
    """Split the UPRN DataFrame into fork-shareable numpy/Arrow arrays.

    Returns ``(east, north, postcodes)``: two contiguous float64 ndarrays built
    from the ``GRIDGB1E`` / ``GRIDGB1N`` columns, and a single (non-chunked)
    pyarrow Array built from ``PCDS``.

    Multiprocessing workers slice these per OA via
    :func:`get_oa_uprns_arrays` **without touching polars**, which avoids the
    fork-after-threads deadlock hazard of polars' rayon pool. Because they are
    plain numpy/Arrow buffers rather than millions of Python objects, ``fork``
    shares them copy-on-write instead of duplicating ~1GB per worker.
    """
    import pyarrow as pa

    def _contiguous_f64(column: str) -> np.ndarray:
        # Force a C-contiguous float64 buffer whatever polars hands back.
        return np.ascontiguousarray(df[column].to_numpy(), dtype=np.float64)

    arrow_postcodes = df["PCDS"].to_arrow()
    # Flatten a chunked result so workers can slice one contiguous array.
    postcodes = (
        arrow_postcodes.combine_chunks()
        if isinstance(arrow_postcodes, pa.ChunkedArray)
        else arrow_postcodes
    )
    return _contiguous_f64("GRIDGB1E"), _contiguous_f64("GRIDGB1N"), postcodes
def get_oa_uprns_arrays(
    east: np.ndarray,
    north: np.ndarray,
    postcodes,
    offsets: dict[str, tuple[int, int]],
    oa_code: str,
) -> tuple[np.ndarray, list[str]]:
    """Polars-free counterpart of :func:`get_oa_uprns`, safe to call in workers.

    Looks up the half-open ``(start, end)`` row range for ``oa_code`` in
    ``offsets`` and slices the arrays from :func:`extract_uprn_arrays`.

    Returns ``(points, postcodes)``: an ``(n, 2)`` array of easting/northing
    pairs and the matching postcodes as a Python list. Raises ``KeyError`` when
    ``oa_code`` has no entry in ``offsets``.
    """
    start, stop = offsets[oa_code]
    rows = slice(start, stop)
    coords = np.column_stack((east[rows], north[rows]))
    # Arrow's slice takes (offset, length), not (start, stop).
    oa_postcodes = postcodes.slice(start, stop - start).to_pylist()
    return coords, oa_postcodes

View file

@ -11,9 +11,9 @@ from pathlib import Path
import numpy as np
import polars as pl
from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
from pipeline.transform.price_estimation.index import build_index
from pipeline.transform.price_estimation.knn import (
KNN_BLEND_WEIGHT,
build_knn_pool,
knn_median_psm,
)
@ -115,7 +115,10 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
.exp()
)
.fill_null(pl.col("input_price").cast(pl.Float64))
# Keep null when the index can't be interpolated, matching production
# (estimate.py ships null there). compute_metrics filters to finite
# positive predictions, so these rows correctly drop from the Index n
# rather than silently degrading to the Naive prediction.
.alias("predicted"),
)
return test
@ -265,13 +268,12 @@ def main():
f" kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)"
)
# Blend: (1-w)*index + w*kNN where both available
# Blend with the exact shipped estimator (stability gate + last-price cap +
# null-when-no-index) so the "Blended" stage reflects production accuracy.
# input_price is the backtest equivalent of production's "Last known price".
index_est = test["predicted"].to_numpy().astype(np.float64)
knn_valid = np.isfinite(knn_est) & (knn_est > 0)
blended = np.where(
knn_valid & np.isfinite(index_est),
(1 - KNN_BLEND_WEIGHT) * index_est + KNN_BLEND_WEIGHT * knn_est,
np.where(np.isfinite(index_est), index_est, knn_est),
blended = guarded_blend_estimates(
index_est, knn_est, test["input_price"].cast(pl.Float64).to_numpy()
)
actual = test["actual_price"].to_numpy().astype(np.float64)

View file

@ -19,6 +19,8 @@ from tqdm import tqdm
from pipeline.transform.price_estimation.shrinkage import (
blend_dicts,
hierarchical_shrinkage,
reanchor_dict,
reanchor_dicts,
shrink_dicts,
spatial_smooth,
)
@ -431,6 +433,17 @@ def build_index(
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
)
# Re-anchor every repeat-sales dict to the global base year before any
# shrinkage/smoothing/blending. solve_robust_index anchors each cell to
# log-index 0 at its OWN earliest year, so cells with shorter histories
# are measured from a later origin; combining them key-by-key would
# otherwise average level-incompatible numbers. The hedonic fallback is
# already anchored at min_year, so we align everything to min_year.
national_idx = reanchor_dict(national_idx, min_year)
area_idx = reanchor_dicts(area_idx, min_year)
district_idx = reanchor_dicts(district_idx, min_year)
sector_idx = reanchor_dicts(sector_idx, min_year)
# Shrinkage: national -> hedonic first, then hierarchical
print(" Applying shrinkage...")
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)

View file

@ -13,6 +13,51 @@ SPATIAL_NEIGHBORS = 5
SPATIAL_BLEND_K = 30
def _base_value(index: dict[int, float], base_year: int) -> float:
"""Value of an index dict at `base_year`, with forward/back-fill for gaps.
Each repeat-sales dict is anchored to 0 at its OWN earliest year, so its
values are log-levels relative to that origin. To express it on a common
origin we need its value at the shared `base_year`:
- exact hit: use it directly;
- base_year before the dict's history: back-fill, i.e. the earliest known
value (which is 0.0 by construction). We cannot observe the level move
between the global base and a later-starting cell, so we assume none,
matching forward_fill's back-fill convention;
- base_year inside a gap / after history: forward-fill the most recent
prior value.
"""
if base_year in index:
return index[base_year]
years = sorted(index)
if not years or base_year < years[0]:
return index[years[0]] if years else 0.0
prior = [y for y in years if y <= base_year]
return index[prior[-1]]
def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
    """Shift an index dict by a constant so its value at `base_year` is 0.

    The same constant is subtracted from every year, so all within-dict
    year-to-year differences are preserved and estimate.py's (current - sale)
    semantics are unchanged; only the cross-dict level mismatch is removed
    before blending.

    An empty dict, or one whose base value is already 0, is returned as the
    SAME object (no copy); otherwise a new dict is returned.
    """
    offset = _base_value(index, base_year) if index else 0.0
    if offset == 0.0:
        # Nothing to shift: hand back the input untouched.
        return index
    return {year: level - offset for year, level in index.items()}
def reanchor_dicts(
    indices: dict[str, dict[int, float]], base_year: int
) -> dict[str, dict[int, float]]:
    """Apply :func:`reanchor_dict` to every index dict in a mapping.

    Returns a new outer mapping with the same keys, each value re-anchored to
    the common `base_year`.
    """
    return {
        name: reanchor_dict(series, base_year)
        for name, series in indices.items()
    }
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
"""Shrink dict values toward parent using n/(n+k) weighting.

View file

@ -158,6 +158,53 @@ def test_transform_crime_writes_by_year_output(tmp_path):
assert serious[2024] == 12.0
def test_transform_crime_headline_is_mean_of_per_year_bars(tmp_path):
    """The avg/yr headline is the simple mean of the by-year chart bars (each
    year's annualised count), NOT a month-weighted pooled rate. The two differ
    whenever years have uneven partial-month coverage, as in this fixture."""
    crime_dir = tmp_path / "crime"
    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"

    def write_month(month, crime_ids):
        # One month directory holding one street CSV, one burglary row per ID.
        month_dir = crime_dir / month
        month_dir.mkdir(parents=True)
        rows = [
            f"{crime_id},{month},F,F,-0.1,51.5,X,E01000001,L,Burglary,U,"
            for crime_id in crime_ids
        ]
        csv_path = month_dir / f"{month}-test-force-street.csv"
        csv_path.write_text("\n".join([header, *rows]) + "\n")

    # 2023: 6 burglaries in 1 month -> 6 * 12 / 1 = 72/yr.
    write_month("2023-01", range(1, 7))
    # 2024: 2 burglaries across 2 months -> 2 * 12 / 2 = 12/yr.
    write_month("2024-01", [7])
    write_month("2024-02", [8])

    output = tmp_path / "crime.parquet"
    by_year_output = tmp_path / "crime_by_year.parquet"
    transform_crime(crime_dir, output, by_year_output)

    headline = pl.read_parquet(output).to_dicts()[0]["Burglary (avg/yr)"]
    first_row = pl.read_parquet(by_year_output).row(0, named=True)
    bars = {}
    for point in first_row["Burglary (by year)"]:
        bars[point["year"]] = point["count"]

    # Mean of per-year bars = (72 + 12) / 2 = 42.0.
    # The old pooled rate (8 incidents / 3 months * 12 = 32.0) would be wrong.
    assert headline == 42.0
    assert bars == {2023: 72.0, 2024: 12.0}
    # Headline equals the mean of the bars it summarises.
    assert headline == sum(bars.values()) / len(bars)
def test_transform_crime_fails_without_valid_months(tmp_path):
crime_dir = tmp_path / "crime"
month_dir = crime_dir / "2024-01"

View file

@ -252,6 +252,63 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}
def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
    # P (AB1 1AA) only has burglaries in 2024; Q (AB1 1AB), far away, has one
    # in 2014. Across all postcodes the type spans TWO distinct years, but P
    # itself is present in just ONE. P's headline must divide by its own
    # years-present (1) and so equal its single by-year bar (24/yr); dividing
    # by the global span (2) would deflate it to 12/yr.
    # Both squares have equal area, so area normalisation leaves counts as-is.
    units = tmp_path / "units"
    p_square = _square_feature("AB1 1AA", 1000, 1000, 1010, 1010)
    q_square = _square_feature("AB1 1AB", 5000, 5000, 5010, 5010)
    _write_boundaries(units, {"AB1": [p_square, q_square]})

    crime = tmp_path / "crime"
    # P: 2 burglaries in a single 2024 month -> 24/yr bar, present in 1 year.
    p_incidents = [_crime_row("2024-01", 1005, 1005, "Burglary") for _ in range(2)]
    _write_month(crime, "2024-01", p_incidents)
    # Q: 1 burglary in a far-back 2014 month -> widens the type's global span
    # to two years without adding any incident to P.
    q_incidents = [_crime_row("2014-01", 5005, 5005, "Burglary")]
    _write_month(crime, "2014-01", q_incidents)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    headline = {
        row["postcode"]: row["Burglary (avg/yr)"]
        for row in pl.read_parquet(output).to_dicts()
    }
    yearly_points = {
        row["postcode"]: row["Burglary (by year)"]
        for row in pl.read_parquet(by_year).to_dicts()
    }

    def bars_for(postcode):
        # year -> annualised count for one postcode's by-year series.
        return {point["year"]: point["count"] for point in yearly_points[postcode]}

    # P's headline equals the simple mean of its own bars (just the 2024 bar).
    p_bars = bars_for("AB1 1AA")
    assert p_bars == {2024: pytest.approx(24.0, abs=0.05)}
    # Per-postcode denominator (1) -> 24.0. The old global denominator (2 years
    # across all postcodes) would have deflated this to 12.0.
    assert headline["AB1 1AA"] == pytest.approx(24.0, abs=0.05)
    p_mean = sum(p_bars.values()) / len(p_bars)
    assert headline["AB1 1AA"] == pytest.approx(p_mean, abs=0.05)

    # Q likewise: its sole 2014 bar -> 12/yr, divided by its own 1 year = 12.0.
    q_bars = bars_for("AB1 1AB")
    assert q_bars == {2014: pytest.approx(12.0, abs=0.05)}
    assert headline["AB1 1AB"] == pytest.approx(12.0, abs=0.05)
def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
units = tmp_path / "units"
_write_boundaries(

View file

@ -149,6 +149,7 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
"town_city": ["Exampletown", "Exampletown"],
"duration": ["F", "F"],
"old_new": ["N", "N"],
"ppd_category": ["A", "A"],
}
).write_parquet(price_paid_path)
@ -201,6 +202,7 @@ def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
"town_city": ["Exampletown", "Exampletown"],
"duration": ["F", "F"],
"old_new": ["N", "N"],
"ppd_category": ["A", "A"],
}
).write_parquet(price_paid_path)
@ -235,6 +237,7 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
"town_city": ["Exampletown"],
"duration": ["F"],
"old_new": ["N"],
"ppd_category": ["A"],
}
).write_parquet(price_paid_path)
@ -259,6 +262,93 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
]
def test_run_excludes_category_b_sales_from_price_aggregations(tmp_path: Path):
    # Category B entries (repossessions, bulk/portfolio, power-of-sale) must
    # stay out of latest_price / historical_prices, while the property itself
    # still survives through its standard Category A sales.
    certificates = io.StringIO()
    writer = csv.DictWriter(certificates, fieldnames=EPC_SOURCE_COLUMNS)
    writer.writeheader()
    writer.writerow(_row())
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", certificates.getvalue())

    sale_count = 3

    def repeated(value):
        # Same value for every sale of the single test property.
        return [value] * sale_count

    price_paid_path = tmp_path / "price-paid.parquet"
    sales = pl.DataFrame(
        {
            "price": [200_000, 250_000, 5_000_000],
            "date_of_transfer": [date(year, 2, 3) for year in (2020, 2022, 2024)],
            "property_type": repeated("T"),
            "postcode": repeated("AA1 1AA"),
            "paon": repeated("1"),
            "saon": repeated(None),
            "street": repeated("Example Street"),
            "locality": repeated(None),
            "town_city": repeated("Exampletown"),
            "duration": repeated("F"),
            "old_new": repeated("N"),
            # The latest (5M) sale is a Category B bulk/portfolio transfer.
            "ppd_category": ["A", "A", "B"],
        }
    )
    sales.write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    result = pl.read_parquet(output_path)
    assert result.height == 1
    # Only the two Category A sales survive; the 5M Category B transfer is dropped.
    assert result.get_column("latest_price").to_list() == [250_000]
    assert result.get_column("historical_prices").list.len().to_list() == [2]
def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: Path):
    # A new-build whose earliest sale is below MIN_PRICE must still take that
    # early year as its EXACT construction date, while latest_price is drawn
    # only from the quality-passing (>=MIN_PRICE) sale.
    certificates = io.StringIO()
    writer = csv.DictWriter(certificates, fieldnames=EPC_SOURCE_COLUMNS)
    writer.writeheader()
    writer.writerow(_row())
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", certificates.getvalue())

    sale_count = 2

    def repeated(value):
        # Same value for every sale of the single test property.
        return [value] * sale_count

    price_paid_path = tmp_path / "price-paid.parquet"
    sales = pl.DataFrame(
        {
            "price": [30_000, 300_000],
            "date_of_transfer": [date(year, 2, 3) for year in (2015, 2022)],
            "property_type": repeated("T"),
            "postcode": repeated("AA1 1AA"),
            "paon": repeated("1"),
            "saon": repeated(None),
            "street": repeated("Example Street"),
            "locality": repeated(None),
            "town_city": repeated("Exampletown"),
            "duration": repeated("F"),
            "old_new": repeated("Y"),
            "ppd_category": repeated("A"),
        }
    )
    sales.write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    result = pl.read_parquet(output_path)
    assert result.height == 1
    # Construction year is the genuine earliest transfer (2015), flagged EXACT,
    # even though that sale is below MIN_PRICE.
    assert result.get_column("construction_age_band").to_list() == [2015]
    assert result.get_column("is_construction_date_approximate").to_list() == [0]
    # latest_price uses only the >=MIN_PRICE sale; the sub-MIN sale is excluded.
    assert result.get_column("latest_price").to_list() == [300_000]
    assert result.get_column("historical_prices").list.len().to_list() == [1]
def test_epc_band_to_year_uses_midpoint_and_clamps():
import polars as pl

View file

@ -13,7 +13,9 @@ from pipeline.transform.merge import (
_active_english_postcode_area,
_build_unmatched_listing_seed_rows,
_canonical_postcode_expr,
_coalesce_direct_epc_columns,
_filter_to_active_english_postcodes,
_join_area_side_tables,
_finalize_listings,
_integrate_listings,
_match_direct_epc,
@ -506,6 +508,25 @@ def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
assert loaded["_actual_lat"].to_list() == [51.5]
def test_load_listings_for_merge_uprn_key_matches_normalize_uprn(tmp_path) -> None:
    # A Float UPRN (e.g. read from a NaN-bearing parquet column) must yield the
    # same digits-only key as `_normalize_uprn` does on the candidate side, or
    # the exact UPRN match is lost. Naively stringifying "100023336956.0" and
    # stripping non-digits gives "1000233369560" (a bogus trailing zero), which
    # never collides with the candidate key "100023336956".
    float_uprn = 100023336956.0
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    uprn_column = pl.lit(float_uprn, dtype=pl.Float64).alias("UPRN")
    listings = _sample_listings_frame().with_columns(uprn_column)
    listings.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    loaded = _load_listings_for_merge(listings_path, arcgis_path)
    keys = loaded["_listing_uprn"].to_list()

    assert keys == [_normalize_uprn(float_uprn)]
    assert keys == ["100023336956"]
def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
tmp_path,
) -> None:
@ -697,6 +718,110 @@ def test_normalize_uprn_handles_types_and_floats() -> None:
assert _normalize_uprn(float("nan")) is None
def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
    # Upstream applies fill_null("No") to the raw property value, so a plain
    # coalesce would let a non-null "No" override a directly-matched listing
    # "Yes". "Former council house" should fire if EITHER side says "Yes".
    none_col = [None] * 5
    columns = {
        "was_council_house": ["No", "Yes", "No", None, None],
        "_direct_was_council_house": ["Yes", "No", None, "Yes", None],
        # An unrelated direct-EPC column keeps the plain-coalesce behaviour.
        "current_energy_rating": [None, "C", "D", None, None],
        "_direct_current_energy_rating": ["B", "A", None, "E", None],
    }
    # _coalesce_direct_epc_columns coalesces every pair in
    # _DIRECT_EPC_RAW_COLUMN_MAP, so the remaining pairs must be present too.
    filler_columns = (
        "epc_address",
        "potential_energy_rating",
        "total_floor_area",
        "number_habitable_rooms",
        "floor_height",
        "construction_age_band",
        "is_construction_date_approximate",
    )
    for raw_name in filler_columns:
        columns[raw_name] = none_col
        columns[f"_direct_{raw_name}"] = none_col
    wide = pl.LazyFrame(columns)

    result = _coalesce_direct_epc_columns(wide).collect()

    council = result["was_council_house"].to_list()
    ratings = result["current_energy_rating"].to_list()
    assert council == ["Yes", "Yes", "No", "Yes", None]
    # Plain coalesce (raw wins when non-null) is untouched for other columns.
    assert ratings == ["B", "C", "D", "E", None]
def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    # Crime is LEFT-joined per postcode; a postcode absent from the crime table
    # must NOT be fabricated as "zero crime" (the safest value). When every
    # per-type column is null the Serious/Minor rollups must stay null.
    postcodes = ["AA1 1AA", "BB2 2BB"]
    lsoas = ["E01000001", "E01000002"]
    districts = ["E09000001", "E09000002"]
    constituencies = ["E14000001", "E14000002"]

    base = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoas,
            "Local Authority District code (2024)": districts,
            "pcon": constituencies,
        }
    )

    def _by_postcode(extra: dict) -> pl.LazyFrame:
        # Postcode-keyed side table covering both postcodes plus any extras.
        return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})

    # Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
    serious_types = {
        "Violence and sexual offences (avg/yr)": [1.0],
        "Robbery (avg/yr)": [2.0],
        "Burglary (avg/yr)": [3.0],
        "Possession of weapons (avg/yr)": [4.0],
    }
    minor_type_names = (
        "Anti-social behaviour (avg/yr)",
        "Criminal damage and arson (avg/yr)",
        "Shoplifting (avg/yr)",
        "Bicycle theft (avg/yr)",
        "Theft from the person (avg/yr)",
        "Other theft (avg/yr)",
        "Vehicle crime (avg/yr)",
        "Public order (avg/yr)",
        "Drugs (avg/yr)",
        "Other crime (avg/yr)",
    )
    minor_types = {name: [1.0] for name in minor_type_names}
    crime = pl.LazyFrame({"postcode": ["AA1 1AA"], **serious_types, **minor_types})

    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": lsoas}),
        ethnicity=pl.LazyFrame({"Geography_code": districts}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": lsoas}),
        election=pl.LazyFrame({"pcon": constituencies}),
        poi_counts=_by_postcode({}),
        noise=_by_postcode({}),
        school_proximity=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=pl.LazyFrame({"bb_postcode": postcodes}),
    ).collect()

    rollups = {
        postcode: (serious, minor)
        for postcode, serious, minor in joined.select(
            "postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
        ).iter_rows()
    }

    # Present postcode: rollups are the component sums (1+2+3+4, 10×1).
    present_serious, present_minor = rollups["AA1 1AA"]
    assert present_serious == 10.0
    assert present_minor == 10.0
    # Missing postcode: rollups stay null rather than fabricating 0.0.
    missing_serious, missing_minor = rollups["BB2 2BB"]
    assert missing_serious is None
    assert missing_minor is None
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
base = {
"postcode": "AA1 1AA",

View file

@ -1,9 +1,44 @@
import polars as pl
from pipeline.transform.poi_proximity import (
POI_GROUPS_2KM,
_build_poi_category_groups,
_dynamic_poi_metric_renames,
_groceries_categories,
)
from pipeline.utils.poi_counts import count_pois_per_postcode
def test_groceries_2km_counts_geolytix_brand_categories() -> None:
    """GEOLYTIX brand POIs must be included in the static groceries 2km count.

    GEOLYTIX puts the brand (e.g. "Tesco") in `category` with group "Groceries"
    and never emits the literal "Supermarket"; matching only the OSM strings
    would count the supermarket but drop the brand store.
    """
    postcodes = pl.DataFrame(
        {"postcode": ["SW1A 1AA"], "lat": [51.5010], "lon": [-0.1416]}
    )
    # (category, group, lat, lng): one GEOLYTIX brand store and one OSM supermarket.
    poi_rows = [
        ("Tesco", "Groceries", 51.5011, -0.1417),
        ("Supermarket", "Groceries", 51.5012, -0.1418),
    ]
    categories, groups, lats, lngs = (list(column) for column in zip(*poi_rows))
    pois = pl.DataFrame(
        {"category": categories, "group": groups, "lat": lats, "lng": lngs}
    )

    groups_2km = dict(POI_GROUPS_2KM)
    groups_2km["groceries"] = _groceries_categories(pois)
    result = count_pois_per_postcode(postcodes, pois, groups=groups_2km, radius_km=2)

    # Both the GEOLYTIX brand ("Tesco") and the OSM "Supermarket" must count.
    # Pre-fix the static list was ["Greengrocer", "Supermarket", "Convenience
    # Store"], so "Tesco" was dropped and this was 1.
    assert result["groceries_2km"][0] == 2
def test_dynamic_poi_groups_include_requested_categories_only() -> None:

View file

@ -1,6 +1,10 @@
import polars as pl
from pipeline.transform.transform_poi import transform_grocery_retail_points
from pipeline.transform.transform_poi import (
_load_ofsted_ratings,
_school_icon_category_expr,
transform_grocery_retail_points,
)
def test_transform_grocery_retail_points_outputs_chain_categories():
@ -112,6 +116,33 @@ def test_transform_grocery_retail_points_merges_cooperative_societies():
]
def test_transform_grocery_retail_points_pools_small_coop_societies_before_cutoff():
    # Each Co-op society has <5 in-England stores; they only clear
    # MIN_GROCERY_CHAIN_LOCATIONS together, after being normalised to the
    # shared "Co-op" brand.
    societies = [
        "Central England Co-operative",
        "Lincolnshire Co-operative",
        "The Southern Co-operative",
        "Midcounties Co-operative",
        "Heart of England Co-operative",
    ]
    store_total = len(societies)
    store_numbers = range(1, store_total + 1)
    raw = pl.DataFrame(
        {
            "id": list(store_numbers),
            "retailer": societies,
            "fascia": ["The Co-operative Food"] * store_total,
            "store_name": [f"Co-op Test {number}" for number in store_numbers],
            "long_wgs": [-0.141] * store_total,
            "lat_wgs": [51.515] * store_total,
        }
    )

    pois = transform_grocery_retail_points(raw)

    # Every store is kept, and all of them land under the single pooled brand.
    assert pois.height == store_total
    assert pois["category"].unique().to_list() == ["Co-op"]
def test_transform_grocery_retail_points_accepts_base_fascias():
raw = pl.DataFrame(
{
@ -177,3 +208,87 @@ def test_transform_grocery_retail_points_includes_unmapped_chains_with_five_loca
{"id": "glx-4", "category": "Tian Tian", "icon_category": "Tian Tian"},
{"id": "glx-5", "category": "Tian Tian", "icon_category": "Tian Tian"},
]
def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
    """Graded results win; the ungraded outcome only fills in when they are unusable.

    Each case is (URN, graded effectiveness, ungraded outcome, expected label):

    * URNs 1-4: graded results map straight through.
    * URNs 5-6: no usable graded result (null / "Not judged") but a
      good/outstanding ungraded outcome, including a "(Concerns)" suffix.
    * URN 7: genuinely "Not judged" with no ungraded outcome.
    * URN 8: a real grade 3 must NOT be overridden by an ungraded outcome.
    """
    cases = [
        (1, "1", None, "Outstanding"),
        (2, "2", None, "Good"),
        (3, "3", None, "Requires improvement"),
        (4, "4", None, "Inadequate"),
        (5, None, "School remains Outstanding", "Outstanding"),
        (6, "Not judged", "School remains Good (Concerns)", "Good"),
        (7, "Not judged", None, "Not judged"),
        (8, "3", "School remains Outstanding", "Requires improvement"),
    ]

    # Write the fixture to disk because the loader takes a parquet path.
    ofsted_path = tmp_path / "ofsted.parquet"
    fixture = pl.DataFrame(
        {
            "URN": [urn for urn, _, _, _ in cases],
            "Latest OEIF overall effectiveness": [
                graded for _, graded, _, _ in cases
            ],
            "Ungraded inspection overall outcome": [
                ungraded for _, _, ungraded, _ in cases
            ],
        }
    )
    fixture.write_parquet(ofsted_path)

    loaded = _load_ofsted_ratings(ofsted_path).collect()
    ratings = loaded.sort("urn").to_dicts()

    assert ratings == [
        {"urn": urn, "ofsted_rating": expected} for urn, _, _, expected in cases
    ]
def test_school_icon_category_handles_one_sided_age_ranges():
    """Every age_range shape must map to a specific school icon category.

    gias._format_age_range emits "up to {high}", "{low}+" and "{low}–{high}".
    All three shapes must classify rather than fall through to "School"; the
    only row expected to yield "School" is the one with a null age_range (and
    null phase).
    """
    # NOTE(review): the range separator was missing from this copy of the test
    # ("318" / "411"), which turns a two-sided range into a single integer. An
    # en-dash is assumed here -- confirm against gias._format_age_range.
    cases = [
        ("up to 5", "Nursery school"),
        ("16+", "Sixth form"),
        ("3–18", "All-through school"),
        ("4–11", "Primary school"),
        (None, "School"),
    ]
    row_count = len(cases)

    df = pl.DataFrame(
        {
            # phase and type_group stay null so only age_range drives the result.
            "phase": [None] * row_count,
            "type_group": [None] * row_count,
            "age_range": [age_range for age_range, _ in cases],
        },
        # Production reads these from a scanned parquet as String; an all-null
        # Python list would otherwise infer the Null dtype and break .str ops.
        schema_overrides={
            "phase": pl.String,
            "type_group": pl.String,
            "age_range": pl.String,
        },
    )

    categories = df.select(
        _school_icon_category_expr().alias("category")
    )["category"].to_list()

    assert categories == [expected for _, expected in cases]

View file

@ -1289,22 +1289,27 @@ def transform_grocery_retail_points(
)
df = df.filter(pl.Series(mask))
eligible_retailers = (
df.group_by("retailer")
# Normalise to the display brand FIRST so the ~16 Co-op society retailer
# names pool into one "Co-op" before the chain-eligibility cutoff; otherwise
# small societies (<MIN_GROCERY_CHAIN_LOCATIONS stores each) get dropped.
df = df.with_columns(
pl.col("retailer")
.map_elements(normalize_grocery_retailer, return_dtype=pl.String)
.alias("category")
)
eligible_categories = (
df.group_by("category")
.len()
.filter(pl.col("len") >= min_chain_locations)
.select("retailer")
.select("category")
)
df = df.join(eligible_retailers, on="retailer", how="semi")
df = df.join(eligible_categories, on="category", how="semi")
return df.with_columns(
pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
pl.coalesce(["store_name", "fascia", "retailer"])
.str.replace_all("''", "'")
.alias("name"),
pl.col("retailer")
.map_elements(normalize_grocery_retailer, return_dtype=pl.String)
.alias("category"),
pl.struct(["fascia", "retailer"])
.map_elements(
lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
@ -1338,10 +1343,16 @@ def _school_icon_category_expr() -> pl.Expr:
# GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
# primary") so we normalise before matching.
phase = pl.col("phase").str.to_lowercase()
# age_range is "<min><max>" using an em-dash; both ends may be missing.
age_parts = pl.col("age_range").str.split_exact("", 1)
min_age = age_parts.struct.field("field_0").cast(pl.Int32, strict=False)
max_age = age_parts.struct.field("field_1").cast(pl.Int32, strict=False)
# gias._format_age_range emits three shapes: "<low><high>" (em-dash),
# "up to <high>" (high-only) and "<low>+" (low-only). Extract the leading
# integer as low and the trailing integer as high, then suppress the wrong
# end for the one-sided shapes so they don't collapse to a single bound.
age = pl.col("age_range")
leading = age.str.extract(r"^\s*(\d+)", 1).cast(pl.Int32, strict=False)
trailing = age.str.extract(r"(\d+)\s*$", 1).cast(pl.Int32, strict=False)
# "up to N": no low bound; "N+": no high bound.
min_age = pl.when(age.str.starts_with("up to")).then(None).otherwise(leading)
max_age = pl.when(age.str.ends_with("+")).then(None).otherwise(trailing)
return (
pl.when(pl.col("type_group") == "Universities")
.then(pl.lit("University"))
@ -1386,9 +1397,16 @@ OFSTED_OEIF_LABELS = {
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
"""Project the latest OEIF effectiveness grade to a human-readable label,
keyed by URN so it can be joined onto the GIAS register. Grades 1-4 map to
the conventional Ofsted labels; "Not judged" (post-2025 reform schools that
only have a report card) is preserved verbatim; null grades drop out."""
the conventional Ofsted labels; when there is no usable graded result
(null/"Not judged", e.g. schools last seen under the post-2024 ungraded
report-card framework) we fall back to "Ungraded inspection overall outcome"
so genuinely good/outstanding schools aren't dropped — mirroring
school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
grade_col = pl.col("Latest OEIF overall effectiveness")
# See school_proximity: the ungraded outcome carries "School remains Good"/
# "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
# suffixes) when the graded column is null/"Not judged".
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
label = (
pl.when(grade_col == "1")
.then(pl.lit(OFSTED_OEIF_LABELS["1"]))
@ -1398,6 +1416,10 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
.then(pl.lit(OFSTED_OEIF_LABELS["3"]))
.when(grade_col == "4")
.then(pl.lit(OFSTED_OEIF_LABELS["4"]))
.when(ungraded.str.starts_with("School remains Outstanding"))
.then(pl.lit(OFSTED_OEIF_LABELS["1"]))
.when(ungraded.str.starts_with("School remains Good"))
.then(pl.lit(OFSTED_OEIF_LABELS["2"]))
.when(grade_col == "Not judged")
.then(pl.lit("Not judged"))
.otherwise(None)