This commit is contained in:
Andras Schmelczer 2026-06-02 20:14:32 +01:00
parent fbfebc651c
commit aab85fe32e
33 changed files with 2016 additions and 283 deletions

View file

@ -273,6 +273,28 @@ def _write_avg_yr(
for type_idx, name in enumerate(ALL_CRIME_TYPES):
data[f"{name} (avg/yr)"] = avg[:, type_idx]
# Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
# bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
# average over the years in which ANY of those types occurred. This keeps the
# headline equal to the mean of the "Serious/Minor crime (by year)" bars.
# Summing the per-type avg/yr values instead (as the merge previously did)
# divides each type by its OWN years-present and overstates the rollup when a
# postcode's serious/minor types occur in disjoint years.
for rollup_name, rollup_types in (
("Serious crime", SERIOUS_CRIME_TYPES),
("Minor crime", MINOR_CRIME_TYPES),
):
rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
rollup_counts = counts[:, rollup_idx, :].sum(axis=1) # (n_postcodes, n_years)
rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
rollup_years_present = np.clip(
(rollup_counts > 0).sum(axis=1), 1, None
).astype(np.float64)
rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
np.float32
)
output_path.parent.mkdir(parents=True, exist_ok=True)
pl.DataFrame(data).write_parquet(output_path, compression="zstd")
print(f"Wrote postcode crime averages: {output_path}")

View file

@ -106,7 +106,14 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
.alias("potential_energy_rating"),
_clean_string("property_type").alias("epc_property_type"),
_clean_string("built_form").alias("built_form"),
_clean_string("inspection_date").alias("inspection_date"),
# Parse to a real Date once (unparseable/blank -> null) so dedup can
# sort newest-first with nulls_last and _event_year can use dt.year();
# a lexicographic string sort would let a null/garbled date win under
# Polars' default nulls-first descending order. EPC inspection dates
# are ISO (YYYY-MM-DD).
_clean_string("inspection_date")
.str.to_date(format="%Y-%m-%d", strict=False)
.alias("inspection_date"),
_clean_number("total_floor_area", pl.Float64).alias("total_floor_area"),
_clean_number("number_habitable_rooms", pl.Int16).alias(
"number_habitable_rooms"
@ -247,9 +254,11 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
)
# Dedup fork: keep latest certificate per property (existing logic)
# Dedup fork: keep latest certificate per property. inspection_date is a typed
# Date (see _select_epc_columns); nulls_last keeps a real-dated cert ahead of a
# null/unparseable-dated one so the genuinely newest certificate is chosen.
epc = (
epc_base.sort("inspection_date", descending=True)
epc_base.sort("inspection_date", descending=True, nulls_last=True)
.group_by("_epc_match_address", "_epc_match_postcode")
.first()
.drop("tenure")
@ -303,11 +312,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
)
.filter(pl.col("_event").is_not_null())
.with_columns(
pl.col("inspection_date")
.cast(pl.String)
.str.slice(0, 4)
.cast(pl.Int32)
.alias("_event_year"),
pl.col("inspection_date").dt.year().cast(pl.Int32).alias("_event_year"),
)
.group_by("_epc_match_address", "_epc_match_postcode")
.agg(

View file

@ -807,6 +807,22 @@ def _remap_terminated_postcodes(
)
def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
    """Collapse duplicate (postcode, pp_address) rows to the latest sale.

    Remapping terminated postcodes can send two distinct postcodes to the same
    active successor, which puts one physical address on a single
    (postcode, pp_address) key with conflicting sale records. Only the row with
    the newest date_of_transfer survives, so the headline price/date describe
    the most recent transaction. Rows with a different pp_address are genuinely
    different addresses and are left alone. pp_address is non-null at this point
    (join_epc_pp filters it), so the key never merges unrelated rows.
    """
    # Newest transfer first; rows without a date sink to the bottom so they can
    # only win when a key has no dated row at all.
    newest_first = wide.sort("date_of_transfer", descending=True, nulls_last=True)
    # keep="first" + maintain_order=True therefore picks the newest row per key.
    return newest_first.unique(
        subset=["postcode", "pp_address"],
        keep="first",
        maintain_order=True,
    )
def _filter_to_active_english_postcodes(
wide: pl.LazyFrame, active_postcodes: pl.LazyFrame
) -> pl.LazyFrame:
@ -837,38 +853,19 @@ def _join_area_side_tables(
)
# Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA.
base = base.join(crime, on="postcode", how="left")
serious_crime_cols = [
"Violence and sexual offences (avg/yr)",
"Robbery (avg/yr)",
"Burglary (avg/yr)",
"Possession of weapons (avg/yr)",
]
minor_crime_cols = [
"Anti-social behaviour (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other theft (avg/yr)",
"Vehicle crime (avg/yr)",
"Public order (avg/yr)",
"Drugs (avg/yr)",
"Other crime (avg/yr)",
]
# The LEFT join leaves every per-type column null for postcodes absent from
# the crime table; sum_horizontal alone would fabricate a "zero crime"
# rollup there, so keep the rollup null when ALL components are null.
base = base.with_columns(
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
.then(None)
.otherwise(pl.sum_horizontal(serious_crime_cols))
.alias("serious_crime_avg_yr"),
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
.then(None)
.otherwise(pl.sum_horizontal(minor_crime_cols))
.alias("minor_crime_avg_yr"),
# postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
# precomputes the Serious/Minor headline rollups as the mean of the by-year
# rollup bars; read those straight through (renamed to the internal columns
# _finalize_merged_columns expects) rather than re-summing the per-type
# avg/yr columns — summing divides each type by its OWN years-present and
# overstates the rollup when types differ in coverage. A postcode absent from
# the crime table keeps null rollups via the left join (no fabricated zero);
# the per-type avg/yr columns pass through unchanged for display.
base = base.join(crime, on="postcode", how="left").rename(
{
"Serious crime (avg/yr)": "serious_crime_avg_yr",
"Minor crime (avg/yr)": "minor_crime_avg_yr",
}
)
base = base.join(median_age, on="lsoa21", how="left")
@ -881,7 +878,37 @@ def _join_area_side_tables(
)
if tree_density is not None:
base = base.join(tree_density, on="postcode", how="left")
return base.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
# Broadband is the one side table sourced straight from a third-party CSV
# (Ofcom `postcode_space`) rather than from a sibling pcds-keyed pipeline
# step, so its postcode may drift in spacing/casing from the NSPL `pcds`
# base key. Normalize BOTH sides to the same canonical pcds form (reusing
# `_canonical_postcode_expr`, exactly as the listing/EPC re-anchor joins do)
# before joining, otherwise a real postcode silently misses and its
# `max_download_speed` reads as null "no data" downstream. Re-aggregate on
# the canonical key so two raw spellings collapsing to one key can't fan out
# the base; drop a null canonical key so an unparseable Ofcom row joins
# nothing rather than matching a null-key base row.
broadband_canonical = (
broadband.with_columns(
_canonical_postcode_expr("bb_postcode").alias("_bb_canonical_postcode")
)
.drop_nulls("_bb_canonical_postcode")
.group_by("_bb_canonical_postcode")
.agg(pl.col("max_download_speed").max())
)
return (
base.with_columns(
_canonical_postcode_expr("postcode").alias("_base_canonical_postcode")
)
.join(
broadband_canonical,
left_on="_base_canonical_postcode",
right_on="_bb_canonical_postcode",
how="left",
)
.drop("_base_canonical_postcode")
)
def _finalize_merged_columns(frame: pl.LazyFrame) -> pl.LazyFrame:
@ -1328,7 +1355,7 @@ def _load_direct_epc_candidates(
)
return (
epc_base.sort("inspection_date", descending=True)
epc_base.sort("inspection_date", descending=True, nulls_last=True)
.group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
.first()
.join(
@ -1918,6 +1945,10 @@ def _build(
# terminated English postcodes are retained under their successor postcode.
postcode_mapping = build_postcode_mapping(arcgis_path)
wide = _remap_terminated_postcodes(wide, postcode_mapping.lazy())
# The remap can collapse two terminated postcodes onto one active successor,
# duplicating a physical address's (postcode, pp_address) key; keep only the
# most-recent transaction per address before the per-postcode joins.
wide = _dedupe_collapsed_properties(wide)
arcgis_raw = pl.scan_parquet(arcgis_path)
arcgis = _active_english_postcode_area(arcgis_raw)
active_postcodes = arcgis.select("postcode").unique()

View file

@ -164,19 +164,39 @@ def _read_noise_tile(
for info in candidates:
with rasterio.open(info.path) as source:
# The Defra rasters encode genuine "quiet / below threshold" as the
# value 0.0 (only -96.0 is true nodata). Mask both BEFORE
# reprojecting so resampling never blends a 0 cell into an adjacent
# loud corridor and fabricates a halo of intermediate dB.
#
# Lden values are dB (a logarithmic scale), so bilinear resampling
# would arithmetically average neighbouring dB cells, which is
# acoustically wrong (it diluted a 75 dB peak to ~53 dB in tests)
# and inconsistent with the postcode sampler. Use Resampling.max:
# it preserves peak corridors, never invents an intermediate dB
# between a masked (NaN) quiet cell and a loud one, and mirrors the
# max semantics of sample_noise_at_postcodes.
src_arr = source.read(1).astype(np.float32)
nodata = source.nodata
invalid = ~np.isfinite(src_arr) | (src_arr <= 0)
if nodata is not None:
invalid |= np.isclose(
src_arr, np.float32(nodata), rtol=1e-5, atol=1e-5
)
src_arr = np.where(invalid, np.float32("nan"), src_arr)
tile = np.full((tile_size, tile_size), np.nan, dtype=np.float32)
reproject(
source=rasterio.band(source, 1),
source=src_arr,
destination=tile,
src_transform=source.transform,
src_crs=source.crs,
src_nodata=source.nodata if source.nodata is not None else 0,
src_nodata=float("nan"),
dst_transform=from_bounds(
left, bottom, right, top, tile_size, tile_size
),
dst_crs=WEB_MERCATOR_CRS,
dst_nodata=np.nan,
resampling=Resampling.bilinear,
resampling=Resampling.max,
)
tile[~np.isfinite(tile) | (tile <= 0)] = np.nan

View file

@ -27,7 +27,7 @@ from .output import (
to_wgs84_geojson_multi,
write_district_geojson,
)
from .process_oa import _extract_polygonal, process_oa
from .process_oa import MIN_GEOM_AREA, _extract_polygonal, process_oa
from .uprn import get_oa_uprns, load_uprns
from .voronoi import _equal_split_fallback, compute_voronoi_regions
@ -341,6 +341,65 @@ class TestVoronoiDeduplication:
assert "B" in result, "Postcode B missing with int64 coords"
class TestVoronoiCoincidentClusterNotCrushed:
    """Every postcode in a coincident cluster (3+ at one point) keeps a cell.

    Before the fix the first coincident postcode was left unjittered at the
    exact cluster centre; once other seeds existed in the OA its Voronoi cell
    shrank below MIN_GEOM_AREA and _clean_polygonal dropped that active
    postcode downstream. Coincident postcodes are now spread onto a small
    regular polygon (equal wedges), so none is crushed.
    """

    def test_coincident_cluster_plus_outer_seed_no_postcode_crushed(self):
        # Block of flats: four distinct postcodes on one building coordinate
        # plus one postcode elsewhere in the OA. Previously the centre seed's
        # cell collapsed to ~0.0001 m^2 (< MIN_GEOM_AREA) and was dropped.
        postcodes = ["A", "B", "C", "D", "OUT"]
        coincident = [[500, 500]] * 4  # A-D share the building coordinate
        elsewhere = [[100, 100]]  # OUT sits somewhere else in the OA
        points = np.array(coincident + elsewhere, dtype=np.float64)
        boundary = box(0, 0, 1000, 1000)

        result = compute_voronoi_regions(points, postcodes, boundary)

        for pc in postcodes:
            assert pc in result, f"Postcode {pc} was dropped"
            assert result[pc].area > MIN_GEOM_AREA, (
                f"Postcode {pc} cell {result[pc].area} <= MIN_GEOM_AREA"
            )

    def test_coincident_cluster_partitions_into_fair_wedges(self, square_boundary):
        # N postcodes on one coordinate should carve the surrounding area into
        # roughly equal wedges (regular-polygon seeds), none degenerate.
        postcodes = ["A", "B", "C", "D", "E"]
        shared_coord = [500050, 180050]
        points = np.array([shared_coord] * len(postcodes), dtype=np.float64)

        result = compute_voronoi_regions(points, postcodes, square_boundary)

        fair_share = square_boundary.area / len(postcodes)
        for pc in postcodes:
            assert pc in result, f"Postcode {pc} was dropped"
            # A wedge must be a meaningful fraction of its fair share.
            assert result[pc].area > 0.3 * fair_share, (
                f"Postcode {pc} cell {result[pc].area} far below fair share {fair_share}"
            )

    def test_two_coincident_split_is_fair(self, square_boundary):
        """Regression: two postcodes at one coordinate split ~50/50."""
        postcodes = ["A", "B"]
        points = np.array([[500050, 180050]] * 2, dtype=np.float64)

        result = compute_voronoi_regions(points, postcodes, square_boundary)

        assert "A" in result and "B" in result
        total = result["A"].area + result["B"].area
        assert result["A"].area / total > 0.4
        assert result["B"].area / total > 0.4
# ---------------------------------------------------------------------------
# Bug 4: Voronoi collinear fallback gives everything to first postcode
# ---------------------------------------------------------------------------

View file

@ -20,33 +20,48 @@ def compute_voronoi_regions(
# Convert to float64 so sub-metre jitter isn't truncated.
points = points.astype(np.float64)
# Deduplicate points, keeping one per (location, postcode) pair.
# Multiple postcodes at the same coordinate each get their own point,
# jittered by a tiny offset (0.01m) so Voronoi can distinguish them.
# Coords are rounded to mm precision for stable hashing — UPRN inputs are
# already integer metres, but the float64 cast can introduce ULP noise.
GOLDEN_ANGLE = np.pi * (3.0 - np.sqrt(5.0))
# Deduplicate points, keeping one per (location, postcode) pair. Coords are
# rounded to mm precision for stable hashing — UPRN inputs are already integer
# metres, but the float64 cast can introduce ULP noise.
#
# Where several DISTINCT postcodes share one coordinate, jitter ALL of them
# onto a small regular polygon (equal 0.01m radius, equally spaced by angle)
# so their Voronoi cells become equal wedges and NONE is crushed. Leaving any
# seed at the centre — or innermost on a spiral — squeezes its cell below
# MIN_GEOM_AREA, which _clean_polygonal then drops downstream, silently losing
# an active postcode. Seeds at a UNIQUE coordinate are left exactly on their
# UPRN (no perturbation of normal Voronoi output).
rounded_coords = [
(round(float(points[i, 0]), 3), round(float(points[i, 1]), 3))
for i in range(len(points))
]
coord_postcodes: dict[tuple[float, float], set[str]] = defaultdict(set)
for coord, pc in zip(rounded_coords, postcodes):
coord_postcodes[coord].add(pc)
seen: dict[tuple[float, float, str], bool] = {}
unique_pts = []
unique_pcs = []
coord_counts: dict[tuple[float, float], int] = defaultdict(int)
for i in range(len(points)):
coord = (round(float(points[i, 0]), 3), round(float(points[i, 1]), 3))
coord = rounded_coords[i]
key = (coord[0], coord[1], postcodes[i])
if key not in seen:
seen[key] = True
jitter_idx = coord_counts[coord]
coord_counts[coord] += 1
if jitter_idx == 0:
unique_pts.append(points[i].copy())
else:
# Golden-angle spacing distributes any number of jittered
# points evenly around (and outward from) the original coord.
count = len(coord_postcodes[coord])
if count > 1:
# Coincident cluster: equally-spaced regular polygon -> equal
# Voronoi wedges, so every postcode here keeps a fair share.
jitter_idx = coord_counts[coord]
coord_counts[coord] += 1
angle = 2.0 * np.pi * jitter_idx / count
jittered = points[i].copy()
angle = jitter_idx * GOLDEN_ANGLE
jittered[0] += 0.01 * np.cos(angle)
jittered[1] += 0.01 * np.sin(angle)
unique_pts.append(jittered)
else:
unique_pts.append(points[i].copy())
unique_pcs.append(postcodes[i])
if len(unique_pts) == 1:

View file

@ -19,8 +19,7 @@ from tqdm import tqdm
from pipeline.transform.price_estimation.shrinkage import (
blend_dicts,
hierarchical_shrinkage,
reanchor_dict,
reanchor_dicts,
lift_onto_parent,
shrink_dicts,
spatial_smooth,
)
@ -169,33 +168,47 @@ def solve_robust_index(
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
# Temporal smoothness prior: penalise curvature in the year betas with a
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
# zero target). This damps single-year index spikes without flattening
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
# year (min_year, implicit beta=0) has no column, so the penalty spans the
# non-baseline years only. For cells with <3 betas there is no curvature to
# penalise and the solve is unchanged.
# second-difference penalty lambda * (d2 beta / dt2)^2, encoded as extra
# least-squares rows (sqrt(lambda) * [w0, w1, w2] against a zero target).
# The weights are the CALENDAR-SPACING-AWARE second-derivative coefficients
# for the consecutive triple (y0, y1, y2), so gap years are not treated as
# adjacent: a multi-year gap relaxes the penalty (correctly preserving a
# genuine level jump) instead of forcing a smooth ramp. For unit spacing
# (1, 1) these reduce to [1, -2, 1], leaving contiguous cells unchanged.
# This damps single-year index spikes without flattening genuine trends.
# Betas are ordered by calendar year; the baseline year (min_year, implicit
# beta=0) has no column, so the penalty spans the non-baseline years only.
# For cells with <3 betas there is no curvature to penalise and the solve is
# unchanged.
n_pen = 0
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
cols_by_year = [c for _, c in sorted(year_to_col.items())]
years_sorted = sorted(year_to_col)
cols_by_year = [year_to_col[y] for y in years_sorted]
n_pen = n_cols - 2
pen_rows = np.repeat(n + np.arange(n_pen), 3)
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
pen_vals = np.empty(n_pen * 3, dtype=np.float64)
for k in range(n_pen):
pen_cols[3 * k : 3 * k + 3] = (
cols_by_year[k],
cols_by_year[k + 1],
cols_by_year[k + 2],
)
y0, y1, y2 = years_sorted[k], years_sorted[k + 1], years_sorted[k + 2]
w0 = 2.0 / ((y1 - y0) * (y2 - y0))
w1 = -2.0 / ((y1 - y0) * (y2 - y1))
w2 = 2.0 / ((y2 - y1) * (y2 - y0))
pen_vals[3 * k : 3 * k + 3] = (
sqrt_lambda * w0,
sqrt_lambda * w1,
sqrt_lambda * w2,
)
pen_rows_arr = pen_rows.astype(np.int64)
pen_cols_arr = pen_cols
pen_vals_arr = np.tile(
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
).astype(np.float64)
pen_vals_arr = pen_vals
pen_b = np.zeros(n_pen, dtype=np.float64)
n_total_rows = n + n_pen
@ -252,7 +265,11 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
idx = solve_robust_index(y1, y2, lr, w)
if idx:
indices[key] = idx
n_pairs[key] = len(y1)
# Count only information-bearing pairs: same-year (year1==year2) and
# baseline-baseline pairs cancel in the sparse solve and contribute
# zero information to the annual index, so including them would
# inflate the shrinkage weight n/(n+k) and under-shrink noisy sectors.
n_pairs[key] = int(np.count_nonzero(y2 != y1))
return indices, n_pairs
@ -433,20 +450,17 @@ def build_index(
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
)
# Re-anchor every repeat-sales dict to the global base year before any
# shrinkage/smoothing/blending. solve_robust_index anchors each cell to
# log-index 0 at its OWN earliest year, so cells with shorter histories
# are measured from a later origin; combining them key-by-key would
# otherwise average level-incompatible numbers. The hedonic fallback is
# already anchored at min_year, so we align everything to min_year.
national_idx = reanchor_dict(national_idx, min_year)
area_idx = reanchor_dicts(area_idx, min_year)
district_idx = reanchor_dicts(district_idx, min_year)
sector_idx = reanchor_dicts(sector_idx, min_year)
# Shrinkage: national -> hedonic first, then hierarchical
# Shrinkage: national -> hedonic first, then hierarchical. Each cell is
# anchored to log-index 0 at its OWN earliest year (solve_robust_index),
# so cells with shorter histories sit on a later origin than their wider
# parents. Before each blend we lift the child onto its parent's base at
# the child's first year (lift_onto_parent) -- otherwise combining them
# key-by-key averages level-incompatible numbers. The hedonic fallback is
# anchored at the global min_year, so it serves as the base for national.
print(" Applying shrinkage...")
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
national_shrunk = shrink_dicts(
lift_onto_parent(national_idx, hedonic_idx), hedonic_idx, national_n
)
sector_shrunk = hierarchical_shrinkage(
sector_idx,
sector_n,
@ -459,6 +473,7 @@ def build_index(
sector_to_dist,
dist_to_area,
shrink_dicts,
lift_onto_parent,
)
# Spatial smoothing

View file

@ -142,6 +142,20 @@ def _sale_identity_matches(
target_price: float,
target_sale_date: int,
) -> np.ndarray:
"""Mark pool comparables that are (almost certainly) the target's own sale.
properties.parquet has no per-property id, so a sale is identified by the
proxy tuple (postcode, price within 0.5, sale_date) to keep a target's own
prior sale out of its comparable set (leakage prevention).
Limitation: new-build / bulk blocks sell many DISTINCT properties in one
postcode on the same day at the same price, so all such siblings collide on
this proxy and are excluded together. This is intentional conservative
over-exclusion: it guarantees no leakage at the cost of occasionally
dropping legitimate same-(postcode, price, date) siblings. The effect is
bounded (~1.8% of the pool) and a precise fix would require a per-property
id that the data does not carry.
"""
if not target_postcode or not np.isfinite(target_price) or target_sale_date < 0:
return np.zeros(len(pool_postcodes), dtype=bool)
return (
@ -166,6 +180,16 @@ def knn_median_psm(
PSM is at the reference date used when building the pool.
NaN where not computable (missing coords, unknown type, too few neighbors).
Coordinate limitation: lat/lon come from postcode.parquet (one centroid per
postcode), so every property within a postcode is co-located. For a dense
postcode the "k nearest" therefore degenerates into an arbitrary
same-postcode subset whose membership is decided by KDTree index order
rather than true proximity. No property-level coordinates exist to fix this,
so the kNN signal is treated as a weak, noisy prior: the downstream guarded
blend (guarded_blend_estimates) only blends kNN when it is close to the
index estimate and otherwise discards it, bounding the impact of this
degeneracy. The result is deterministic for a fixed pool order.
"""
n = len(lat)
result = np.full(n, np.nan)

View file

@ -36,26 +36,43 @@ def _base_value(index: dict[int, float], base_year: int) -> float:
return index[prior[-1]]
def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
    """Re-anchor an index dict so index[base_year] == 0 (pure constant shift).

    The same constant is subtracted from every year, which preserves all
    within-dict year-to-year differences, so estimate.py's (current - sale)
    semantics are unchanged; it only addresses the cross-dict level mismatch
    before blending. An empty dict, or one whose shift is already zero, is
    returned as the same object.
    """
    if not index:
        return index
    shift = _base_value(index, base_year)
    return index if shift == 0.0 else {y: v - shift for y, v in index.items()}


def reanchor_dicts(
    indices: dict[str, dict[int, float]], base_year: int
) -> dict[str, dict[int, float]]:
    """Re-anchor every index dict in a mapping to the common `base_year`."""
    rebased: dict[str, dict[int, float]] = {}
    for key, idx in indices.items():
        rebased[key] = reanchor_dict(idx, base_year)
    return rebased


def lift_onto_parent(
    child: dict[int, float], parent: dict[int, float]
) -> dict[int, float]:
    """Lift a child index onto its parent's base before blending the two.

    solve_robust_index anchors every cell to log-index 0 at its OWN earliest
    year, so a cell with a shorter history sits on a later origin than its
    (wider) parent. Blending them key-by-key would average level-incompatible
    numbers (a sector measured from 2008 against a district measured from
    1996). The parent's accumulated level at the child's first year is added,
    so ``child[start] == parent[start]``: the child's own year-to-year moves
    are layered on top of the parent's growth up to that point -- the same
    assumption shrinkage already makes for years the child lacks.

    Re-basing on each cell's OWN earliest year (rather than the global base,
    which the child cannot observe) is what makes this effective: subtracting
    the child's value at the global base is always 0 and changes nothing.

    The shift is one constant added to every year of the child, so the child's
    own year-to-year differences are preserved. PRECONDITION for the downstream
    estimate to be unaffected within the child's range: the parent's year
    coverage must be a superset of the child's. This holds throughout
    build_index, where each parent aggregates a superset of its children's sale
    pairs, so shrink_dicts blends every child year against a present parent
    year and the constant shift cancels in a within-range (current - sale)
    difference; only comparisons that span the child's start year (e.g. a sale
    predating the cell's own data) change. If a caller violates the
    precondition (a child year the parent lacks), shrink_dicts passes that year
    through unshrunk and the cancellation no longer holds.

    An empty child or parent, or a zero offset, returns `child` itself.
    """
    if not (child and parent):
        return child
    first_year = min(child)
    offset = _base_value(parent, first_year) - child[first_year]
    if offset == 0.0:
        return child
    return {year: level + offset for year, level in child.items()}
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
@ -84,30 +101,40 @@ def hierarchical_shrinkage(
sector_to_dist: dict[str, str],
dist_to_area: dict[str, str],
shrink_fn: Callable[[V, V, int], V],
lift_fn: Callable[[V, V], V] | None = None,
) -> dict[str, V]:
"""Top-down hierarchical shrinkage: area->top, district->area, sector->district.
`top_level` is the ultimate fallback value (e.g. national shrunk toward hedonic,
or just national). `shrink_fn(raw, parent, n)` blends raw toward parent.
`lift_fn(raw, parent)`, if given, re-bases raw onto its parent before blending
(see lift_onto_parent); pass None for category-keyed dicts where re-basing is
meaningless.
"""
def combine(raw: V, parent: V, n: int) -> V:
if lift_fn is not None:
raw = lift_fn(raw, parent)
return shrink_fn(raw, parent, n)
# Area -> top level
area_shrunk = {}
for area, val in area_vals.items():
area_shrunk[area] = shrink_fn(val, top_level, area_n[area])
area_shrunk[area] = combine(val, top_level, area_n[area])
# District -> area
district_shrunk = {}
for dist, val in district_vals.items():
a = dist_to_area.get(dist, "")
parent = area_shrunk.get(a, top_level)
district_shrunk[dist] = shrink_fn(val, parent, district_n[dist])
district_shrunk[dist] = combine(val, parent, district_n[dist])
# Sector -> district
sector_shrunk = {}
for sec, val in sector_vals.items():
d = sector_to_dist.get(sec, "")
parent = district_shrunk.get(d, top_level)
sector_shrunk[sec] = shrink_fn(val, parent, sector_n[sec])
sector_shrunk[sec] = combine(val, parent, sector_n[sec])
# Fill sectors without their own values
for sec in all_sectors:

View file

@ -0,0 +1,135 @@
import numpy as np
import polars as pl
from pipeline.transform.price_estimation import index as index_mod
from pipeline.transform.price_estimation.index import (
compute_indices_for_level,
solve_robust_index,
)
def _pairs_from_path(
    true_levels: dict[int, float],
) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """Build adjacent-year repeat-sale pairs that exactly trace a known path.

    Each consecutive pair's log_ratio is the difference of the true log-levels,
    so the solver should recover the levels exactly (relative to the min year).

    Args:
        true_levels: Mapping of calendar year to true log-level; years need not
            be contiguous or pre-sorted.

    Returns:
        ``(year1, year2, log_ratio, weight)``: ``year1``/``year2`` are int32
        arrays of the earlier/later year of each consecutive pair in sorted
        order, ``log_ratio`` is the float64 level difference, and ``weight`` is
        a float64 array of ones. With fewer than two years every array is empty.
    """
    ordered_years = sorted(true_levels)
    years = np.array(ordered_years, dtype=np.int32)
    levels = np.array([true_levels[y] for y in ordered_years], dtype=np.float64)
    n_pairs = max(len(ordered_years) - 1, 0)
    return (
        years[:-1].copy(),
        years[1:].copy(),
        # diff gives levels[k + 1] - levels[k], i.e. later minus earlier.
        np.diff(levels),
        np.ones(n_pairs, dtype=np.float64),
    )
def test_solver_recovers_contiguous_path():
    """Contiguous data: the solver returns log-levels relative to min_year.

    Shows the IRLS solver is correct (and unchanged) for contiguous years,
    where the spacing-aware penalty reduces to the standard [1,-2,1] weights.
    """
    years = range(2010, 2021)
    # Smooth ramp: zero curvature, so the smoothness prior should not bias it.
    true = {y: 0.04 * (y - 2010) for y in years}

    # Triplicate every adjacent pair so MIN_PAIRS is comfortably met.
    y1, y2, lr, w = (np.tile(column, 3) for column in _pairs_from_path(true))

    idx = solve_robust_index(y1, y2, lr, w)

    assert idx[2010] == 0.0  # baseline anchor
    for y in years:
        expected = true[y] - true[2010]
        assert abs(idx[y] - expected) < 1e-3
def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
    """FIX #5: a sharp true level jump across a multi-year gap survives.

    Coverage is 2000, 2001, 2002 and then 2015, 2016, with cross-gap pairs that
    encode a sharp jump at the gap. A uniform [1,-2,1] curvature penalty treats
    (beta_2002, beta_2015, beta_2016) as three adjacent years and over-penalises
    the genuine jump, dragging beta_2015 down toward a smooth ramp. The
    spacing-aware second difference relaxes the penalty across the gap.
    """
    # True log-levels relative to min_year (2000 anchored at 0).
    true = {
        2000: 0.0,
        2001: 0.05,
        2002: 0.10,
        2015: 1.10,  # sharp +1.0 jump across the gap
        2016: 1.15,
    }
    spans = [
        # In-segment adjacent pairs.
        (2000, 2001),
        (2001, 2002),
        (2015, 2016),
        # Cross-gap pairs consistent with the sharp jump.
        (2002, 2015),
        (2002, 2016),
    ]
    copies = 4  # each span contributes four identical pairs

    y1 = np.repeat(np.array([a for a, _ in spans], dtype=np.int32), copies)
    y2 = np.repeat(np.array([b for _, b in spans], dtype=np.int32), copies)
    lr = np.repeat(
        np.array([true[b] - true[a] for a, b in spans], dtype=np.float64), copies
    )
    w = np.ones(len(spans) * copies, dtype=np.float64)

    # A strong penalty makes any smoothing bias obvious; always restore the
    # module constant so other tests see the configured value.
    original = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
    index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = 1.0
    try:
        idx = solve_robust_index(y1, y2, lr, w)
    finally:
        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = original

    assert idx[2000] == 0.0  # baseline anchor
    # beta_2015 must stay near its true post-gap level rather than being pulled
    # down by a penalty that treats the gap as a single-year step.
    assert abs(idx[2015] - true[2015]) < 0.05
def test_n_pairs_counts_only_cross_year_pairs():
    """FIX #12: n_pairs counts only cross-year (year2 != year1) pairs.

    Same-year pairs carry zero index information and must not inflate the
    shrinkage weight.
    """
    # (year1, year2, copies): 8 genuine cross-year pairs spanning enough years
    # for a valid solve, plus 3 zero-information same-year pairs.
    batches = [
        (2010, 2011, 4),
        (2011, 2012, 4),
        (2012, 2012, 3),  # same-year, zero info
    ]
    rows = [
        {
            "grp": "g",
            "year1": year1,
            "year2": year2,
            "log_ratio": 0.03 * (year2 - year1),
            "weight": 1.0,
        }
        for year1, year2, copies in batches
        for _ in range(copies)
    ]

    indices, n_pairs = compute_indices_for_level(pl.DataFrame(rows), "grp")

    assert "g" in indices
    assert n_pairs["g"] == 8  # not 11

View file

@ -71,9 +71,49 @@ def test_knn_excludes_same_sale_and_uses_stable_comparables():
),
)
# The five 900k same-postcode siblings share the target's (postcode, price,
# date) identity proxy, so they are all excluded as comparables, leaving the
# 200k/80sqm = 2_500 PSM neighbours. Removing same-identity siblings is an
# INTENTIONAL conservative leakage-prevention tradeoff (no per-property id
# exists to distinguish a target's own resale from a distinct bulk-block
# sibling sold same-day at the same price), not ideal behaviour -- see the
# _sale_identity_matches docstring.
assert psm[0] == 2_500.0
def test_knn_median_psm_is_deterministic():
    """Reproducibility guard (BUG #6) for the kNN median price-per-sqm lookup.

    Within-postcode neighbours are co-located (one centroid per postcode), so
    the kNN result for dense postcodes depends on an arbitrary same-postcode
    subset. That is acceptable, but it MUST be stable: two identical calls
    against the same trees/inputs return identical output, so future refactors
    cannot silently introduce run-to-run nondeterminism.
    """
    sale_date = date(2026, 1, 1)

    # 40 near-coincident sales in one postcode, each with a distinct price.
    pool_rows = []
    for offset in range(40):
        pool_rows.append(
            {
                "Postcode": "AA1 1AA",
                "Property type": "Detached",
                "lat": 51.5000 + offset * 0.00001,
                "lon": -0.1000,
                "Total floor area (sqm)": 80.0,
                "Last known price": 200_000.0 + offset * 1_000.0,
                "Date of last transaction": sale_date,
            }
        )
    pool = pl.DataFrame(pool_rows)
    trees = build_knn_pool(pool.lazy(), _flat_index(), 2026.0)

    # The very same query arrays are fed to both calls.
    query = {
        "lat": np.array([51.5000, 51.5002]),
        "lon": np.array([-0.1000, -0.1000]),
        "type_groups": np.array(["Detached", "Detached"]),
    }
    first_run = knn_median_psm(trees, **query)
    second_run = knn_median_psm(trees, **query)

    assert np.array_equal(first_run, second_run)
def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
blended = guarded_blend_estimates(
index_est=np.array([120_000.0, 1_000_000.0]),

View file

@ -1,99 +1,117 @@
"""Regression tests for common-base-year re-anchoring before blending.
"""Regression tests for parent-base lifting before hierarchical blending.
Each repeat-sales index dict is anchored to log-index 0 at its OWN earliest
year. shrink_dicts / blend_dicts combine dicts key-by-key, so dicts anchored to
different base years must be re-anchored to a single common base first, or the
solve_robust_index anchors every repeat-sales cell to log-index 0 at its OWN
earliest year, so a cell with a shorter history sits on a later origin than its
(wider) parent. shrink_dicts / blend_dicts combine dicts key-by-key, so a child
must first be lifted onto its parent's base at the child's first year, or the
blend averages level-incompatible numbers (fix5-index-base-year).
Note: re-anchoring each cell to the *global* base year is a no-op on real data
(a cell anchored to 0 at its own earliest year already reads 0 there, and the
global base is never later), which is why the fix lifts onto the *parent* at the
child's own start year instead.
"""
from pipeline.transform.price_estimation.shrinkage import (
blend_dicts,
reanchor_dict,
reanchor_dicts,
hierarchical_shrinkage,
lift_onto_parent,
shrink_dicts,
)
from pipeline.transform.price_estimation.utils import SHRINKAGE_K
def test_reanchor_is_pure_constant_shift_preserving_differences():
"""Re-anchoring only shifts the origin; year-to-year moves are unchanged."""
# Anchored at its own earliest year 2008.
idx = {2008: 0.0, 2009: 0.10, 2010: 0.25, 2011: 0.40}
def test_lift_rebases_late_starting_child_onto_parent():
"""A child anchored at its own later start year is lifted to the parent's level there."""
parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
# Sector with its own repeat-sales data only from 2016, anchored at 2016 = 0.
sector = {2016: 0.0, 2024: 0.20}
reanchored = reanchor_dict(idx, 1996)
# 1996 is before this dict's history -> back-fill earliest value (0.0),
# so the shift is 0 and the dict is unchanged.
assert reanchored[2008] == 0.0
lifted = lift_onto_parent(sector, parent)
# Same shape, different exact-hit base year: anchoring at 2010 subtracts 0.25.
reanchored_2010 = reanchor_dict(idx, 2010)
assert reanchored_2010[2010] == 0.0
# All within-dict differences are preserved under the constant shift.
years = sorted(idx)
for a, b in zip(years, years[1:]):
assert abs((reanchored_2010[b] - reanchored_2010[a]) - (idx[b] - idx[a])) < 1e-12
# child[start] now equals the parent's accumulated level at that year.
assert abs(lifted[2016] - parent[2016]) < 1e-12 # 1.20
assert abs(lifted[2024] - (parent[2016] + 0.20)) < 1e-12 # 1.40
# Pure constant shift: the child's own year-to-year move is preserved.
assert abs((lifted[2024] - lifted[2016]) - (sector[2024] - sector[2016])) < 1e-12
def test_blend_different_base_years_needs_reanchoring():
"""Blending two dicts on different bases is biased unless re-anchored first.
def test_lift_is_noop_when_child_starts_at_parent_base():
    """A child whose earliest year is the parent's base (value 0) is unchanged."""
    parent_index = {1996: 0.0, 2008: 0.80, 2016: 1.20}
    child_index = {1996: 0.0, 2008: 0.75, 2016: 1.10}

    lifted = lift_onto_parent(child_index, parent_index)

    assert lifted == child_index
Both cells observe the common base year 1996 but were anchored to DIFFERENT
origins (sectorA at 1996, sectorB at 2008, as solve_robust_index would do for
cells whose pair history starts at different years). They describe the SAME
true trajectory measured from 1996, so a 50/50 blend should reproduce that
common level. Pre-fix, blend_dicts mixes sectorB's 2008-relative numbers with
sectorA's 1996-relative numbers, level-shifting the smoothed result.
def test_lift_handles_empty_inputs():
    """An empty child stays empty; an empty parent leaves the child as-is."""
    lifted_empty_child = lift_onto_parent({}, {2000: 0.0})
    assert lifted_empty_child == {}

    lifted_without_parent = lift_onto_parent({2000: 0.0}, {})
    assert lifted_without_parent == {2000: 0.0}
def test_lift_fixes_estimate_spanning_child_start_but_not_within_range():
"""The lift corrects comparisons that span the cell's start year, and ONLY those.
A property sold in 2008 (before the sector's own data begins in 2016) and
valued in 2024: pre-lift the shrunk index mixes a 2016-based sector level
with 1996-based parent levels and badly understates the move. Comparisons
wholly inside the sector's own range (2016->2024) are unchanged, because the
lift is a pure constant shift that cancels in a within-cell difference.
"""
base_year = 1996
parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
sector = {2016: 0.0, 2024: 0.20} # own data starts 2016
n = 30
w = n / (n + SHRINKAGE_K)
# True log-levels relative to 1996 (identical trajectory for both cells).
truth = {1996: 0.0, 2008: 0.80, 2012: 1.00}
raw = shrink_dicts(sector, parent, n) # pre-fix: blend without lifting
fixed = shrink_dicts(lift_onto_parent(sector, parent), parent, n)
# sectorA: anchored at 1996 (its earliest year) -> equals truth.
sector_a = dict(truth)
# sectorB: same trajectory but anchored at 2008 (subtract truth[2008] from
# every year), exactly how solve_robust_index would express a cell whose
# earliest year happened to be picked as 2008.
shift_b = truth[2008]
sector_b = {y: v - shift_b for y, v in truth.items()}
# Within the sector's own range the lift changes nothing.
assert abs((fixed[2024] - fixed[2016]) - (raw[2024] - raw[2016])) < 1e-12
# --- Pre-fix behaviour: blend the raw dicts directly. ---
raw_blend = blend_dicts(sector_a, [sector_b], 0.5, [0.5])
# Every year is pulled by half of shift_b (0.4) away from the truth.
assert abs(raw_blend[2012] - truth[2012]) > 0.3
assert abs(raw_blend[1996] - truth[1996]) > 0.3
# 2008 is parent-only in both (sector absent), so both read parent[2008].
assert abs(raw[2008] - parent[2008]) < 1e-12
assert abs(fixed[2008] - parent[2008]) < 1e-12
# --- Post-fix behaviour: re-anchor to the common base, THEN blend. ---
reanchored = reanchor_dicts({"A": sector_a, "B": sector_b}, base_year)
fixed_blend = blend_dicts(reanchored["A"], [reanchored["B"]], 0.5, [0.5])
# Both cells now read 0 at 1996 and the true level at every shared year.
for y in truth:
assert abs(fixed_blend[y] - truth[y]) < 1e-9
raw_move = raw[2024] - raw[2008]
fixed_move = fixed[2024] - fixed[2008]
# Hand-computed: raw[2024] = w*0.20 + (1-w)*1.50; fixed[2024] = w*1.40 + (1-w)*1.50.
assert abs(raw_move - ((w * 0.20 + (1 - w) * 1.50) - 0.80)) < 1e-12
assert abs(fixed_move - ((w * 1.40 + (1 - w) * 1.50) - 0.80)) < 1e-12
# The fix raises the spanning move by exactly the parent growth to the
# sector's start year that the raw blend dropped (weighted by w).
assert abs((fixed_move - raw_move) - w * parent[2016]) < 1e-12
# Fixed move is close to the true area-level move (0.70); raw badly understates it.
assert abs(fixed_move - 0.70) < 0.2
assert raw_move < 0.4 * fixed_move
def test_shrink_dicts_after_reanchoring_is_consistent():
"""Shrinking a cell toward its parent must use a common origin."""
base_year = 2000
# Parent (national) anchored at 2000.
parent = {2000: 0.0, 2010: 0.50, 2020: 1.20}
# Sector tracking the parent exactly but anchored at 2010 (subtract 0.50 from
# every year), as solve_robust_index would express a cell whose earliest year
# is later. It still observes the 2000 base year (value -0.50).
sector = {2000: -0.50, 2010: 0.0, 2020: 0.70}
n = 0 # no own data weight -> result should equal parent after anchoring
def test_hierarchical_shrinkage_lift_fn_only_changes_spanning_comparisons():
"""Integration: passing lift_fn re-bases a late-starting sector via its parent chain."""
top = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
sector = {"AB1 1": {2016: 0.0, 2024: 0.20}}
sector_n = {"AB1 1": 300}
# No own area/district indices -> the sector shrinks straight toward `top`.
base_args = (
sector,
sector_n,
{},
{},
{},
{},
top,
["AB1 1"],
{"AB1 1": "AB1"},
{"AB1": "AB"},
shrink_dicts,
)
reanchored_sector = reanchor_dict(sector, base_year)
# Exact hit on 2000 subtracts -0.50, putting the sector back on the parent's
# origin: 0.0 at 2000, 0.50 at 2010, 1.20 at 2020.
shrunk = shrink_dicts(reanchored_sector, parent, n)
assert abs(shrunk[2000] - 0.0) < 1e-9
assert abs(shrunk[2010] - 0.50) < 1e-9
assert abs(shrunk[2020] - 1.20) < 1e-9
without_lift = hierarchical_shrinkage(*base_args)["AB1 1"]
with_lift = hierarchical_shrinkage(*base_args, lift_onto_parent)["AB1 1"]
def test_reanchor_exact_hit_shifts_all_years():
"""When the base year is present, subtract its value from every year."""
idx = {1996: 0.0, 2005: 0.30, 2015: 0.90}
reanchored = reanchor_dict(idx, 2005)
assert reanchored[2005] == 0.0
assert abs(reanchored[1996] - (-0.30)) < 1e-12
assert abs(reanchored[2015] - 0.60) < 1e-12
# Within the sector's own range: identical (pure constant shift cancels).
assert abs(
(with_lift[2024] - with_lift[2016]) - (without_lift[2024] - without_lift[2016])
) < 1e-12
# Spanning the sector's start year: the lift raises the 2008->2024 move.
assert (with_lift[2024] - with_lift[2008]) > (
without_lift[2024] - without_lift[2008]
) + 0.1

View file

@ -252,6 +252,47 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}
def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
    """The Serious rollup headline equals the mean of its by-year bars.

    Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only
    in 2014, Robbery only in 2024 (each a single full month -> 12/yr). The
    headline "Serious crime (avg/yr)" must equal the mean of the "Serious crime
    (by year)" bars (which span the UNION of years any serious type occurred),
    NOT the sum of the per-type means. Summing per-type means divides each type
    by its OWN years-present (1 each) -> 12 + 12 = 24; the consistent rollup
    divides the per-year serious total by the years any serious type occurred
    (2) -> 12.
    """
    units = tmp_path / "units"
    _write_boundaries(
        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
    )

    crime = tmp_path / "crime"
    for month, crime_type in (("2014-01", "Burglary"), ("2024-01", "Robbery")):
        _write_month(crime, month, [_crime_row(month, 1005, 1005, crime_type)])

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    avg = pl.read_parquet(output).row(0, named=True)
    # The precomputed rollup headline exists and equals the mean of the bars
    # (12), not the sum of the per-type avg/yr values (12 + 12 = 24).
    assert "Serious crime (avg/yr)" in avg
    for column in (
        "Burglary (avg/yr)",
        "Robbery (avg/yr)",
        "Serious crime (avg/yr)",
    ):
        assert avg[column] == pytest.approx(12.0, abs=0.05)

    by_year_row = pl.read_parquet(by_year).row(0, named=True)
    serious_bars = {}
    for point in by_year_row["Serious crime (by year)"]:
        serious_bars[point["year"]] = point["count"]
    assert serious_bars == {
        2014: pytest.approx(12.0, abs=0.05),
        2024: pytest.approx(12.0, abs=0.05),
    }

    mean_of_bars = sum(serious_bars.values()) / len(serious_bars)
    assert avg["Serious crime (avg/yr)"] == pytest.approx(mean_of_bars, abs=0.05)
def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
# P (AB1 1AA) has burglaries only in its single most-recent year (2024); Q
# (AB1 1AB), far away, has a burglary in 2014. The type therefore spans TWO

View file

@ -58,7 +58,7 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
"potential_energy_rating": "B",
"epc_property_type": "House",
"built_form": "Mid-Terrace",
"inspection_date": "2024-01-02",
"inspection_date": date(2024, 1, 2),
"total_floor_area": 84.5,
"number_habitable_rooms": None,
"floor_height": 2.4,
@ -179,6 +179,65 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
assert df.get_column("historical_prices").list.len().to_list() == [2]
def test_run_dedup_prefers_valid_dated_cert_over_garbled_date(tmp_path: Path):
    """Dedup keeps the valid-dated certificate, not the garbled-date one.

    Two certificates exist for the same property. The cert with the garbled,
    unparseable inspection_date must NOT be chosen as "latest": a string sort
    nulls-first would have picked it, attaching a stale rating/floor area. The
    valid-dated cert wins, so its rating ("C") and floor area (85) survive.
    """
    certificates = [
        _row(
            current_energy_rating="c",
            inspection_date="2024-01-01",
            total_floor_area="85",
        ),
        # Same property; an unparseable date (OCR/garbled). Under a raw string
        # descending sort "not-a-date" outranks the ISO date and wins the
        # dedup, but as a null Date it loses.
        _row(
            current_energy_rating="g",
            inspection_date="not-a-date",
            total_floor_area="40",
        ),
    ]
    csv_buffer = io.StringIO()
    writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
    writer.writeheader()
    writer.writerows(certificates)

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())

    # A single price-paid sale for the property both certificates describe.
    price_paid = pl.DataFrame(
        {
            "price": [250_000],
            "date_of_transfer": [date(2024, 2, 3)],
            "property_type": ["T"],
            "postcode": ["AA1 1AA"],
            "paon": ["1"],
            "saon": [None],
            "street": ["Example Street"],
            "locality": [None],
            "town_city": ["Exampletown"],
            "duration": ["F"],
            "old_new": ["N"],
            "ppd_category": ["A"],
        }
    )
    price_paid_path = tmp_path / "price-paid.parquet"
    price_paid.write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    df = pl.read_parquet(output_path)
    assert df.height == 1
    # The valid-dated cert's facts are kept; the garbled-date cert is NOT chosen.
    kept = df.select("current_energy_rating", "total_floor_area").to_dicts()
    assert kept == [{"current_energy_rating": "C", "total_floor_area": 85.0}]
def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
zip_path = tmp_path / "domestic-csv.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:

View file

@ -14,6 +14,7 @@ from pipeline.transform.merge import (
_build_unmatched_listing_seed_rows,
_canonical_postcode_expr,
_coalesce_direct_epc_columns,
_dedupe_collapsed_properties,
_filter_to_active_english_postcodes,
_join_area_side_tables,
_finalize_listings,
@ -193,6 +194,159 @@ def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() ->
_validate_postcode_feature_output(postcode_df, expected_postcode_count=2)
def test_postcode_feature_validation_rejects_wrong_count() -> None:
    """Universe-size invariant: the output must hold EXACTLY the active universe.

    Too few rows (silently dropped postcodes) and too many / duplicated rows (a
    join fan-out) must all fail, so neither a truncated build nor a one-to-many
    join can ship.
    """

    def _england_frame(postcodes, lats, lons) -> pl.DataFrame:
        return pl.DataFrame(
            {
                "Postcode": postcodes,
                "lat": lats,
                "lon": lons,
                "ctry25cd": ["E92000001"] * len(postcodes),
            }
        )

    rejected_frames = [
        # Too few rows for the expected universe of 2.
        _england_frame(["AA1 1AA"], [51.0], [-0.1]),
        # Too many rows for the expected universe of 2.
        _england_frame(
            ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
            [51.0, 52.0, 53.0],
            [-0.1, -0.2, -0.3],
        ),
        # Right row count but a duplicated key (n_unique < height) -- the
        # signature of a join fan-out.
        _england_frame(["AA1 1AA", "AA1 1AA"], [51.0, 51.0], [-0.1, -0.1]),
    ]

    for frame in rejected_frames:
        with pytest.raises(ValueError, match="active England postcode universe"):
            _validate_postcode_feature_output(frame, expected_postcode_count=2)
def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
    """Soundness: unique-keyed side tables yield exactly one row per postcode.

    A fan-out here would inflate the postcode universe above the
    active-England count -- the failure the universe-size assertion in
    _validate_postcode_feature_output is the backstop for.
    """
    postcodes = ["AA1 1AA", "BB2 2BB"]
    lsoa_codes = ["E01000001", "E01000002"]
    lad_codes = ["E09000001", "E09000002"]
    pcon_codes = ["E14000001", "E14000002"]

    def _keyed_by_postcode(columns: dict) -> pl.LazyFrame:
        return pl.LazyFrame({"postcode": postcodes, **columns})

    base = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoa_codes,
            "Local Authority District code (2024)": lad_codes,
            "pcon": pcon_codes,
        }
    )
    side_tables = {
        "iod": pl.LazyFrame({"LSOA code (2021)": lsoa_codes}),
        "ethnicity": pl.LazyFrame({"Geography_code": lad_codes}),
        "crime": _keyed_by_postcode(
            {
                "Serious crime (avg/yr)": [1.0, 2.0],
                "Minor crime (avg/yr)": [3.0, 4.0],
            }
        ),
        "median_age": pl.LazyFrame({"lsoa21": lsoa_codes}),
        "election": pl.LazyFrame({"pcon": pcon_codes}),
        "poi_counts": _keyed_by_postcode({}),
        "noise": _keyed_by_postcode({}),
        "school_proximity": _keyed_by_postcode({}),
        "conservation_areas": _keyed_by_postcode(
            {CONSERVATION_AREA_FEATURE: ["Yes", "No"]}
        ),
        "tree_density": None,
        "broadband": pl.LazyFrame(
            {
                "bb_postcode": postcodes,
                "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
            }
        ),
    }

    joined = _join_area_side_tables(base, **side_tables).collect()

    # One row per postcode in -> one row out; the universe is not inflated.
    assert joined.height == 2
    assert sorted(joined["postcode"].to_list()) == ["AA1 1AA", "BB2 2BB"]
def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
    """Broadband joins on a canonical postcode despite spacing/casing drift.

    Broadband comes straight from Ofcom's CSV, so its postcode can drift in
    spacing/casing from the NSPL `pcds` base key. Both sides must be reduced to
    the same canonical form so a real postcode populates `max_download_speed`
    instead of silently missing the left join.
    """
    postcodes = ["AB1 2CD", "EF3 4GH"]
    lsoa_codes = ["E01000001", "E01000002"]
    lad_codes = ["E09000001", "E09000002"]
    pcon_codes = ["E14000001", "E14000002"]

    def _keyed_by_postcode(columns: dict) -> pl.LazyFrame:
        return pl.LazyFrame({"postcode": postcodes, **columns})

    base = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoa_codes,
            "Local Authority District code (2024)": lad_codes,
            "pcon": pcon_codes,
        }
    )
    # AB1 2CD arrives lowercase + un-spaced; EF3 4GH arrives under two distinct
    # raw spellings that canonicalize to one key (the max speed must win, with
    # no fan-out of the base row).
    broadband = pl.LazyFrame(
        {
            "bb_postcode": ["ab1 2cd", "ef34gh", "EF3 4GH"],
            "max_download_speed": pl.Series([300, 30, 1000], dtype=pl.UInt16),
        }
    )
    side_tables = {
        "iod": pl.LazyFrame({"LSOA code (2021)": lsoa_codes}),
        "ethnicity": pl.LazyFrame({"Geography_code": lad_codes}),
        "crime": _keyed_by_postcode(
            {
                "Serious crime (avg/yr)": [1.0, 2.0],
                "Minor crime (avg/yr)": [3.0, 4.0],
            }
        ),
        "median_age": pl.LazyFrame({"lsoa21": lsoa_codes}),
        "election": pl.LazyFrame({"pcon": pcon_codes}),
        "poi_counts": _keyed_by_postcode({}),
        "noise": _keyed_by_postcode({}),
        "school_proximity": _keyed_by_postcode({}),
        "conservation_areas": _keyed_by_postcode(
            {CONSERVATION_AREA_FEATURE: ["Yes", "No"]}
        ),
        "tree_density": None,
        "broadband": broadband,
    }

    joined = _join_area_side_tables(base, **side_tables).collect()

    # No fan-out: still one row per base postcode.
    assert joined.height == 2

    speed_by_postcode = {
        postcode: speed
        for postcode, speed in zip(
            joined["postcode"].to_list(), joined["max_download_speed"].to_list()
        )
    }
    # Spacing/casing drift still joins.
    assert speed_by_postcode["AB1 2CD"] == 300
    # Two raw spellings collapse to one canonical key; the max wins.
    assert speed_by_postcode["EF3 4GH"] == 1000

    # The temporary canonical join keys are not leaked into the output schema.
    for leaked in ("_base_canonical_postcode", "_bb_canonical_postcode", "bb_postcode"):
        assert leaked not in joined.columns
def test_listed_building_feature_is_property_level() -> None:
assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
@ -758,8 +912,10 @@ def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
# The crime table is LEFT-joined per postcode; a postcode absent from it
# must NOT be fabricated as "zero crime" (the safest value). When every
# per-type column is null the Serious/Minor rollups must stay null.
# must NOT be fabricated as "zero crime" (the safest value). The Serious/Minor
# rollups are precomputed in crime_spatial (the mean of the by-year rollup
# bars), so the merge reads them straight through; a missing postcode leaves
# them null.
base = pl.LazyFrame(
{
"postcode": ["AA1 1AA", "BB2 2BB"],
@ -772,7 +928,10 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
def _by_postcode(extra: dict) -> pl.LazyFrame:
return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})
# Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
# Crime is present only for AA1 1AA; BB2 2BB is absent from the table. The
# rollup headlines are precomputed values (deliberately NOT the per-type sum,
# which would be 10.0 each) so this test proves the merge consumes the
# precomputed column rather than re-summing per-type columns.
crime = pl.LazyFrame(
{
"postcode": ["AA1 1AA"],
@ -790,6 +949,8 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
"Public order (avg/yr)": [1.0],
"Drugs (avg/yr)": [1.0],
"Other crime (avg/yr)": [1.0],
"Serious crime (avg/yr)": [7.5],
"Minor crime (avg/yr)": [4.2],
}
)
@ -805,7 +966,12 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
school_proximity=_by_postcode({}),
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
tree_density=None,
broadband=pl.LazyFrame({"bb_postcode": ["AA1 1AA", "BB2 2BB"]}),
broadband=pl.LazyFrame(
{
"bb_postcode": ["AA1 1AA", "BB2 2BB"],
"max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
}
),
).collect()
by_postcode = {
@ -814,14 +980,50 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
"postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
).iter_rows(named=True)
}
# Present postcode: rollups are the component sums (1+2+3+4, 10×1).
assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 10.0
assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 10.0
# Present postcode: rollups are the precomputed headline values, read through
# unchanged (NOT the per-type sum of 10.0).
assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 7.5
assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 4.2
# Missing postcode: rollups stay null rather than fabricating 0.0.
assert by_postcode["BB2 2BB"]["serious_crime_avg_yr"] is None
assert by_postcode["BB2 2BB"]["minor_crime_avg_yr"] is None
def test_dedupe_collapsed_properties_keeps_most_recent_per_address() -> None:
    """Dedup keeps one row per (postcode, pp_address): the latest transaction.

    The terminated-postcode remap can merge two distinct postcodes onto one
    active successor, collapsing the same physical address onto a single
    (postcode, pp_address) key with conflicting sale records. The dedup must
    keep exactly one row per key -- the most recent transaction -- and must not
    collapse genuinely distinct addresses.
    """
    from datetime import datetime

    older_sale = datetime(1990, 1, 1)
    newer_sale = datetime(2015, 6, 1)
    other_sale = datetime(2000, 1, 1)
    wide = pl.LazyFrame(
        {
            "postcode": ["SW3 3JY", "SW3 3JY", "SW3 3JY"],
            "pp_address": ["45 ELYSTAN PLACE", "45 ELYSTAN PLACE", "9 OTHER ROAD"],
            "date_of_transfer": [older_sale, newer_sale, other_sale],
            "latest_price": [1_587_700, 4_500_000, 250_000],
        }
    )

    out = _dedupe_collapsed_properties(wide).collect()

    # One row per (postcode, pp_address): the two ELYSTAN PLACE rows collapse
    # to one.
    assert out.height == 2
    assert out.select(["postcode", "pp_address"]).is_unique().all()

    by_addr = {}
    for record in out.iter_rows(named=True):
        by_addr[record["pp_address"]] = record

    # The kept ELYSTAN PLACE row is the most recent transaction (2015 @ 4.5M),
    # not an arbitrary one.
    elystan = by_addr["45 ELYSTAN PLACE"]
    assert elystan["date_of_transfer"] == datetime(2015, 6, 1)
    assert elystan["latest_price"] == 4_500_000
    # A genuinely distinct address in the same postcode is untouched.
    assert by_addr["9 OTHER ROAD"]["latest_price"] == 250_000
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
base = {
"postcode": "AA1 1AA",

View file

@ -0,0 +1,110 @@
import numpy as np
import rasterio
from rasterio.transform import from_origin
from rasterio.warp import transform_bounds
from pipeline.transform import noise_overlay_tiles
from pipeline.transform.noise_overlay_tiles import RasterInfo, _read_noise_tile
def _write_corridor_raster(path, nodata=-96.0):
    """Write a small EPSG:27700 noise raster to *path* and return *path*.

    The raster is a column of 70 dB cells adjacent to genuine 0.0 (quiet)
    cells. Bilinear blending of the 0 cells would fabricate a halo of
    intermediate dB values between 0 and 70.
    """
    # 8x8 grid: leftmost two columns are 70 dB, the rest are genuine quiet 0.0.
    grid = np.zeros((8, 8), dtype=np.float32)
    grid[:, 0:2] = 70.0
    # Place one true nodata cell to make sure it is also masked out.
    grid[0, 7] = nodata

    # 10m cells anchored somewhere inside England's BNG extent.
    left, top = 300_000.0, 300_080.0
    n_rows, n_cols = grid.shape
    profile = {
        "driver": "GTiff",
        "height": n_rows,
        "width": n_cols,
        "count": 1,
        "dtype": grid.dtype,
        "crs": "EPSG:27700",
        "transform": from_origin(left, top, 10.0, 10.0),
        "nodata": nodata,
    }
    with rasterio.open(path, "w", **profile) as dataset:
        dataset.write(grid, 1)
    return path
def test_read_noise_tile_does_not_fabricate_halo(tmp_path):
    """Rendering the corridor raster yields no values strictly between 0 and 70."""
    raster_path = _write_corridor_raster(tmp_path / "corridor.tif")
    with rasterio.open(raster_path) as dataset:
        source_bounds = dataset.bounds
        bounds_mercator = transform_bounds(
            dataset.crs,
            noise_overlay_tiles.WEB_MERCATOR_CRS,
            *source_bounds,
            densify_pts=21,
        )
    info = RasterInfo(path=raster_path, bounds_mercator=bounds_mercator)

    # Render at high resolution so any bilinear halo would surface as
    # intermediate dB values along the corridor/quiet seam.
    tile_size = 64
    tile = _read_noise_tile([info], bounds_mercator, tile_size)
    finite = tile[np.isfinite(tile)]

    # Every finite cell must be the genuine corridor value (~70). There must be
    # NO fabricated halo strictly between 0 and 70.
    corridor_floor = 70.0 - 1e-3
    halo = finite[(finite > 0.0) & (finite < corridor_floor)]
    assert halo.size == 0, f"fabricated halo values present: {np.unique(halo)}"

    # Sanity: the corridor itself must still be rendered.
    assert finite.size > 0
    assert np.all(finite >= corridor_floor)
def test_read_noise_tile_preserves_peak_under_downsample(tmp_path):
    """A single loud cell must survive downsampling into a smaller tile.

    8x8 EPSG:27700 raster: a single loud 75 dB cell in a 50 dB field.
    Downsampling into a smaller tile with bilinear would dilute the peak
    (arithmetic dB averaging); Resampling.max must keep the worst-case dB.
    """
    field = np.full((8, 8), 50.0, dtype=np.float32)
    field[4, 4] = 75.0

    raster_path = tmp_path / "peak.tif"
    n_rows, n_cols = field.shape
    profile = {
        "driver": "GTiff",
        "height": n_rows,
        "width": n_cols,
        "count": 1,
        "dtype": field.dtype,
        "crs": "EPSG:27700",
        "transform": from_origin(300_000.0, 300_080.0, 10.0, 10.0),
        "nodata": -96.0,
    }
    with rasterio.open(raster_path, "w", **profile) as dataset:
        dataset.write(field, 1)

    with rasterio.open(raster_path) as dataset:
        bounds_mercator = transform_bounds(
            dataset.crs,
            noise_overlay_tiles.WEB_MERCATOR_CRS,
            *dataset.bounds,
            densify_pts=21,
        )
    info = RasterInfo(path=raster_path, bounds_mercator=bounds_mercator)

    # Render the 8x8 source into a 4x4 tile: this downsamples, so bilinear
    # would average the 75 dB peak away.
    tile = _read_noise_tile([info], bounds_mercator, 4)
    finite = tile[np.isfinite(tile)]
    assert finite.size > 0

    peak = finite.max()
    # The loud peak must survive the downsample (max, not arithmetic mean).
    assert peak >= 75.0 - 1e-3, f"peak diluted to {finite.max()}"
    # Max resampling must never invent a value louder than the source.
    assert peak <= 75.0 + 1e-3

View file

@ -1,12 +1,115 @@
import json
import polars as pl
from pipeline.transform.transform_poi import (
_load_ofsted_ratings,
_school_icon_category_expr,
transform,
transform_grocery_retail_points,
)
def _write_boundary(tmp_path):
    """Write a one-feature boundary GeoJSON into *tmp_path* and return its path.

    The FeatureCollection's single polygon covers the London-area test coords
    used by the transform() fixtures, so in_england_mask keeps them.

    Args:
        tmp_path: Directory (a path object supporting ``/``) to write into.

    Returns:
        The path of the written ``england.geojson`` file.
    """
    boundary_path = tmp_path / "england.geojson"
    # Closed ring: the first vertex is repeated as the last one.
    ring = [[-1.0, 51.0], [1.0, 51.0], [1.0, 52.0], [-1.0, 52.0], [-1.0, 51.0]]
    feature_collection = {
        "type": "FeatureCollection",
        "features": [
            {
                "type": "Feature",
                "properties": {},
                "geometry": {"type": "Polygon", "coordinates": [ring]},
            }
        ],
    }
    # Pin the encoding so the file does not depend on the platform's locale
    # default.
    boundary_path.write_text(json.dumps(feature_collection), encoding="utf-8")
    return boundary_path
def _write_transform_inputs(tmp_path, raw_pois: pl.DataFrame):
    """Write the parquet inputs transform() requires around a raw OSM POIs frame.

    NaPTAN / grocery / GIAS / Ofsted are minimal but valid. Returns a dict of
    keyword arguments (input paths) that can be splatted into transform().
    """
    input_path = tmp_path / "pois.parquet"
    naptan_path = tmp_path / "naptan.parquet"
    grocery_path = tmp_path / "grocery.parquet"
    gias_path = tmp_path / "gias.parquet"
    ofsted_path = tmp_path / "ofsted.parquet"

    raw_pois.write_parquet(input_path)

    # One rail station.
    naptan = pl.DataFrame(
        {
            "id": ["naptan-1"],
            "name": ["Test Rail Station"],
            "category": ["Rail station"],
            "lat": [51.51],
            "lng": [-0.13],
        }
    )
    naptan.write_parquet(naptan_path)

    # Five Tesco stores sharing one coordinate.
    store_ids = list(range(1, 6))
    grocery = pl.DataFrame(
        {
            "id": store_ids,
            "retailer": ["Tesco"] * 5,
            "fascia": ["Tesco"] * 5,
            "store_name": [f"Tesco Test {i}" for i in store_ids],
            "long_wgs": [-0.14] * 5,
            "lat_wgs": [51.52] * 5,
        }
    )
    grocery.write_parquet(grocery_path)

    # One primary school (URN 1001).
    gias = pl.DataFrame(
        {
            "urn": [1001],
            "name": ["Test Primary School"],
            "phase": ["Primary"],
            "type": ["Community school"],
            "type_group": ["Local authority maintained schools"],
            "age_range": ["411"],
            "gender": ["Mixed"],
            "religious_character": [None],
            "admissions_policy": ["Comprehensive"],
            "nursery_provision": ["No"],
            "sixth_form": ["No"],
            "capacity": [200],
            "pupils": [180],
            "fsm_percent": [12.5],
            "trust": [None],
            "address": ["1 Test Street"],
            "postcode": ["E1 1AA"],
            "local_authority": ["Test LA"],
            "website": [None],
            "telephone": ["02012345678"],
            "head_name": ["Jane Doe"],
            "lat": [51.53],
            "lng": [-0.12],
        }
    )
    gias.write_parquet(gias_path)

    # Ofsted row keyed on the same URN as the GIAS school above.
    ofsted = pl.DataFrame(
        {
            "URN": [1001],
            "Latest OEIF overall effectiveness": ["2"],
            "Ungraded inspection overall outcome": [None],
        }
    )
    ofsted.write_parquet(ofsted_path)

    boundary_path = _write_boundary(tmp_path)

    return {
        "input_path": input_path,
        "naptan_path": naptan_path,
        "boundary_path": boundary_path,
        "grocery_retail_points_path": grocery_path,
        "gias_path": gias_path,
        "ofsted_path": ofsted_path,
    }
def test_transform_grocery_retail_points_outputs_chain_categories():
raw = pl.DataFrame(
{
@ -292,3 +395,79 @@ def test_school_icon_category_handles_one_sided_age_ranges():
"Primary school",
"School",
]
def test_transform_dedupes_multi_tag_pois(tmp_path):
    """A multi-tag OSM object must not be duplicated within one category.

    pois.py emits one raw row per tag key, all sharing the object's id, so an
    object tagged both "amenity/pharmacy" and "shop/chemist" reaches
    transform() as two rows that both map to the friendly category "Pharmacy".
    """
    shared_id = "n42"
    raw_pois = pl.DataFrame(
        {
            "id": [shared_id] * 2,
            "name": ["Boots"] * 2,
            "category": ["amenity/pharmacy", "shop/chemist"],
            "lat": [51.50] * 2,
            "lng": [-0.10] * 2,
        }
    )

    transform_kwargs = _write_transform_inputs(tmp_path, raw_pois)
    result = transform(**transform_kwargs).collect()

    # Every (id, category) pair occurs at most once in the output ...
    rows_per_pair = result.group_by("id", "category").len()
    assert rows_per_pair["len"].max() == 1

    # ... and the one physical pharmacy survives as exactly one row.
    is_shared_object = pl.col("id") == shared_id
    is_pharmacy = pl.col("category") == "Pharmacy"
    matching = result.filter(is_shared_object & is_pharmacy)
    assert matching.height == 1
def test_osm_supermarkets_dropped(tmp_path):
    """OSM supermarkets are dropped while other OSM grocery categories stay.

    GEOLYTIX is the authoritative supermarket source, so a raw OSM
    "shop/supermarket" row must not come through as a second
    Groceries/Supermarket pin. A complementary grocery category
    (Convenience Store) must still be emitted.
    """
    raw_pois = pl.DataFrame(
        {
            "id": ["n1", "n2"],
            "name": ["Some Supermarket", "Corner Shop"],
            "category": ["shop/supermarket", "shop/convenience"],
            "lat": [51.50, 51.51],
            "lng": [-0.10, -0.11],
        }
    )

    transform_kwargs = _write_transform_inputs(tmp_path, raw_pois)
    result = transform(**transform_kwargs).collect()

    # No row may be labelled Groceries/Supermarket.
    in_groceries = pl.col("group") == "Groceries"
    is_supermarket = pl.col("category") == "Supermarket"
    supermarket_rows = result.filter(in_groceries & is_supermarket)
    assert supermarket_rows.height == 0

    # The complementary OSM grocery category is still present, exactly once.
    convenience_rows = result.filter(pl.col("category") == "Convenience Store")
    assert convenience_rows.height == 1
def test_transform_output_unique_per_id_category(tmp_path):
    """Soundness check: transform() emits at most one row per (id, category).

    The invariant is asserted over the whole output, i.e. across every source
    that transform() combines, not just the raw OSM rows supplied here.
    """
    # Two raw rows share id "n42" under different tag keys; "n7" and "n8" are
    # separate ids that carry the same name.
    ids = ["n42", "n42", "n7", "n8"]
    names = ["Boots", "Boots", "St Mary's", "St Mary's"]
    tag_categories = [
        "amenity/pharmacy",
        "shop/chemist",
        "amenity/place_of_worship",
        "building/church",
    ]
    latitudes = [51.50, 51.50, 51.55, 51.55]
    longitudes = [-0.10, -0.10, -0.15, -0.15]
    raw_pois = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": tag_categories,
            "lat": latitudes,
            "lng": longitudes,
        }
    )

    transform_kwargs = _write_transform_inputs(tmp_path, raw_pois)
    result = transform(**transform_kwargs).collect()

    rows_per_pair = result.group_by("id", "category").len()
    assert rows_per_pair["len"].max() == 1

View file

@ -6,6 +6,10 @@ import polars as pl
from pipeline.utils.england_geometry import in_england_mask
DROP_CATEGORIES = {
# GEOLYTIX Grocery Retail Points is the authoritative supermarket source
# (transform_grocery_retail_points), so drop OSM supermarkets to avoid
# double-counting each store as both a GEOLYTIX brand and an OSM "Supermarket".
"shop/supermarket",
# Street furniture & infrastructure
"amenity/advice",
"amenity/atm",
@ -364,14 +368,6 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"leisure/yes",
],
),
(
"Groceries",
"Supermarket",
"🛒",
[
"shop/supermarket",
],
),
(
"Groceries",
"Convenience Store",
@ -1534,6 +1530,14 @@ def transform(
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
)
# A single OSM object can carry several tag keys that map to the same
# friendly category (e.g. amenity/pharmacy + shop/chemist -> "Pharmacy"),
# which pois.py emits as multiple raw rows sharing one id. Collapse those
# duplicates so they don't inflate downstream proximity counts; rows sharing
# an id with DIFFERENT categories are preserved. Other sources are
# pre-deduplicated.
lf = lf.unique(subset=["id", "category"], keep="first", maintain_order=True)
naptan_df = pl.scan_parquet(naptan_path).collect()
mask = in_england_mask(
boundary_path,