idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
|
|
@ -273,6 +273,28 @@ def _write_avg_yr(
|
|||
for type_idx, name in enumerate(ALL_CRIME_TYPES):
|
||||
data[f"{name} (avg/yr)"] = avg[:, type_idx]
|
||||
|
||||
# Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
|
||||
# bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
|
||||
# average over the years in which ANY of those types occurred. This keeps the
|
||||
# headline equal to the mean of the "Serious/Minor crime (by year)" bars.
|
||||
# Summing the per-type avg/yr values instead (as the merge previously did)
|
||||
# divides each type by its OWN years-present and overstates the rollup when a
|
||||
# postcode's serious/minor types occur in disjoint years.
|
||||
for rollup_name, rollup_types in (
|
||||
("Serious crime", SERIOUS_CRIME_TYPES),
|
||||
("Minor crime", MINOR_CRIME_TYPES),
|
||||
):
|
||||
rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
|
||||
rollup_counts = counts[:, rollup_idx, :].sum(axis=1) # (n_postcodes, n_years)
|
||||
rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
|
||||
rollup_years_present = np.clip(
|
||||
(rollup_counts > 0).sum(axis=1), 1, None
|
||||
).astype(np.float64)
|
||||
rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
|
||||
data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
|
||||
np.float32
|
||||
)
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pl.DataFrame(data).write_parquet(output_path, compression="zstd")
|
||||
print(f"Wrote postcode crime averages: {output_path}")
|
||||
|
|
|
|||
|
|
@ -106,7 +106,14 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
|
|||
.alias("potential_energy_rating"),
|
||||
_clean_string("property_type").alias("epc_property_type"),
|
||||
_clean_string("built_form").alias("built_form"),
|
||||
_clean_string("inspection_date").alias("inspection_date"),
|
||||
# Parse to a real Date once (unparseable/blank -> null) so dedup can
|
||||
# sort newest-first with nulls_last and _event_year can use dt.year();
|
||||
# a lexicographic string sort would let a null/garbled date win under
|
||||
# Polars' default nulls-first descending order. EPC inspection dates
|
||||
# are ISO (YYYY-MM-DD).
|
||||
_clean_string("inspection_date")
|
||||
.str.to_date(format="%Y-%m-%d", strict=False)
|
||||
.alias("inspection_date"),
|
||||
_clean_number("total_floor_area", pl.Float64).alias("total_floor_area"),
|
||||
_clean_number("number_habitable_rooms", pl.Int16).alias(
|
||||
"number_habitable_rooms"
|
||||
|
|
@ -247,9 +254,11 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
|
||||
)
|
||||
|
||||
# Dedup fork: keep latest certificate per property (existing logic)
|
||||
# Dedup fork: keep latest certificate per property. inspection_date is a typed
|
||||
# Date (see _select_epc_columns); nulls_last keeps a real-dated cert ahead of a
|
||||
# null/unparseable-dated one so the genuinely newest certificate is chosen.
|
||||
epc = (
|
||||
epc_base.sort("inspection_date", descending=True)
|
||||
epc_base.sort("inspection_date", descending=True, nulls_last=True)
|
||||
.group_by("_epc_match_address", "_epc_match_postcode")
|
||||
.first()
|
||||
.drop("tenure")
|
||||
|
|
@ -303,11 +312,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
)
|
||||
.filter(pl.col("_event").is_not_null())
|
||||
.with_columns(
|
||||
pl.col("inspection_date")
|
||||
.cast(pl.String)
|
||||
.str.slice(0, 4)
|
||||
.cast(pl.Int32)
|
||||
.alias("_event_year"),
|
||||
pl.col("inspection_date").dt.year().cast(pl.Int32).alias("_event_year"),
|
||||
)
|
||||
.group_by("_epc_match_address", "_epc_match_postcode")
|
||||
.agg(
|
||||
|
|
|
|||
|
|
@ -807,6 +807,22 @@ def _remap_terminated_postcodes(
|
|||
)
|
||||
|
||||
|
||||
def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
|
||||
"""Keep one row per (postcode, pp_address) — the most-recent transaction.
|
||||
|
||||
The terminated-postcode remap can map two distinct postcodes onto one active
|
||||
successor, collapsing the same physical address onto a single
|
||||
(postcode, pp_address) key with conflicting sale records. Keep the row with
|
||||
the latest date_of_transfer so the headline price/date reflect the most
|
||||
recent transaction; genuinely distinct addresses (a different pp_address) are
|
||||
untouched. pp_address is non-null here (join_epc_pp filters it), so the key
|
||||
never merges unrelated rows.
|
||||
"""
|
||||
return wide.sort(
|
||||
"date_of_transfer", descending=True, nulls_last=True
|
||||
).unique(subset=["postcode", "pp_address"], keep="first", maintain_order=True)
|
||||
|
||||
|
||||
def _filter_to_active_english_postcodes(
|
||||
wide: pl.LazyFrame, active_postcodes: pl.LazyFrame
|
||||
) -> pl.LazyFrame:
|
||||
|
|
@ -837,38 +853,19 @@ def _join_area_side_tables(
|
|||
)
|
||||
|
||||
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||
# postcode boundary), so it joins on postcode rather than LSOA.
|
||||
base = base.join(crime, on="postcode", how="left")
|
||||
serious_crime_cols = [
|
||||
"Violence and sexual offences (avg/yr)",
|
||||
"Robbery (avg/yr)",
|
||||
"Burglary (avg/yr)",
|
||||
"Possession of weapons (avg/yr)",
|
||||
]
|
||||
minor_crime_cols = [
|
||||
"Anti-social behaviour (avg/yr)",
|
||||
"Criminal damage and arson (avg/yr)",
|
||||
"Shoplifting (avg/yr)",
|
||||
"Bicycle theft (avg/yr)",
|
||||
"Theft from the person (avg/yr)",
|
||||
"Other theft (avg/yr)",
|
||||
"Vehicle crime (avg/yr)",
|
||||
"Public order (avg/yr)",
|
||||
"Drugs (avg/yr)",
|
||||
"Other crime (avg/yr)",
|
||||
]
|
||||
# The LEFT join leaves every per-type column null for postcodes absent from
|
||||
# the crime table; sum_horizontal alone would fabricate a "zero crime"
|
||||
# rollup there, so keep the rollup null when ALL components are null.
|
||||
base = base.with_columns(
|
||||
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
|
||||
.then(None)
|
||||
.otherwise(pl.sum_horizontal(serious_crime_cols))
|
||||
.alias("serious_crime_avg_yr"),
|
||||
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
|
||||
.then(None)
|
||||
.otherwise(pl.sum_horizontal(minor_crime_cols))
|
||||
.alias("minor_crime_avg_yr"),
|
||||
# postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
|
||||
# precomputes the Serious/Minor headline rollups as the mean of the by-year
|
||||
# rollup bars; read those straight through (renamed to the internal columns
|
||||
# _finalize_merged_columns expects) rather than re-summing the per-type
|
||||
# avg/yr columns — summing divides each type by its OWN years-present and
|
||||
# overstates the rollup when types differ in coverage. A postcode absent from
|
||||
# the crime table keeps null rollups via the left join (no fabricated zero);
|
||||
# the per-type avg/yr columns pass through unchanged for display.
|
||||
base = base.join(crime, on="postcode", how="left").rename(
|
||||
{
|
||||
"Serious crime (avg/yr)": "serious_crime_avg_yr",
|
||||
"Minor crime (avg/yr)": "minor_crime_avg_yr",
|
||||
}
|
||||
)
|
||||
|
||||
base = base.join(median_age, on="lsoa21", how="left")
|
||||
|
|
@ -881,7 +878,37 @@ def _join_area_side_tables(
|
|||
)
|
||||
if tree_density is not None:
|
||||
base = base.join(tree_density, on="postcode", how="left")
|
||||
return base.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
|
||||
|
||||
# Broadband is the one side table sourced straight from a third-party CSV
|
||||
# (Ofcom `postcode_space`) rather than from a sibling pcds-keyed pipeline
|
||||
# step, so its postcode may drift in spacing/casing from the NSPL `pcds`
|
||||
# base key. Normalize BOTH sides to the same canonical pcds form (reusing
|
||||
# `_canonical_postcode_expr`, exactly as the listing/EPC re-anchor joins do)
|
||||
# before joining, otherwise a real postcode silently misses and its
|
||||
# `max_download_speed` reads as null "no data" downstream. Re-aggregate on
|
||||
# the canonical key so two raw spellings collapsing to one key can't fan out
|
||||
# the base; drop a null canonical key so an unparseable Ofcom row joins
|
||||
# nothing rather than matching a null-key base row.
|
||||
broadband_canonical = (
|
||||
broadband.with_columns(
|
||||
_canonical_postcode_expr("bb_postcode").alias("_bb_canonical_postcode")
|
||||
)
|
||||
.drop_nulls("_bb_canonical_postcode")
|
||||
.group_by("_bb_canonical_postcode")
|
||||
.agg(pl.col("max_download_speed").max())
|
||||
)
|
||||
return (
|
||||
base.with_columns(
|
||||
_canonical_postcode_expr("postcode").alias("_base_canonical_postcode")
|
||||
)
|
||||
.join(
|
||||
broadband_canonical,
|
||||
left_on="_base_canonical_postcode",
|
||||
right_on="_bb_canonical_postcode",
|
||||
how="left",
|
||||
)
|
||||
.drop("_base_canonical_postcode")
|
||||
)
|
||||
|
||||
|
||||
def _finalize_merged_columns(frame: pl.LazyFrame) -> pl.LazyFrame:
|
||||
|
|
@ -1328,7 +1355,7 @@ def _load_direct_epc_candidates(
|
|||
)
|
||||
|
||||
return (
|
||||
epc_base.sort("inspection_date", descending=True)
|
||||
epc_base.sort("inspection_date", descending=True, nulls_last=True)
|
||||
.group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
|
||||
.first()
|
||||
.join(
|
||||
|
|
@ -1918,6 +1945,10 @@ def _build(
|
|||
# terminated English postcodes are retained under their successor postcode.
|
||||
postcode_mapping = build_postcode_mapping(arcgis_path)
|
||||
wide = _remap_terminated_postcodes(wide, postcode_mapping.lazy())
|
||||
# The remap can collapse two terminated postcodes onto one active successor,
|
||||
# duplicating a physical address's (postcode, pp_address) key; keep only the
|
||||
# most-recent transaction per address before the per-postcode joins.
|
||||
wide = _dedupe_collapsed_properties(wide)
|
||||
arcgis_raw = pl.scan_parquet(arcgis_path)
|
||||
arcgis = _active_english_postcode_area(arcgis_raw)
|
||||
active_postcodes = arcgis.select("postcode").unique()
|
||||
|
|
|
|||
|
|
@ -164,19 +164,39 @@ def _read_noise_tile(
|
|||
|
||||
for info in candidates:
|
||||
with rasterio.open(info.path) as source:
|
||||
# The Defra rasters encode genuine "quiet / below threshold" as the
|
||||
# value 0.0 (only -96.0 is true nodata). Mask both BEFORE
|
||||
# reprojecting so resampling never blends a 0 cell into an adjacent
|
||||
# loud corridor and fabricates a halo of intermediate dB.
|
||||
#
|
||||
# Lden values are dB (a logarithmic scale), so bilinear resampling
|
||||
# would arithmetically average neighbouring dB cells, which is
|
||||
# acoustically wrong (it diluted a 75 dB peak to ~53 dB in tests)
|
||||
# and inconsistent with the postcode sampler. Use Resampling.max:
|
||||
# it preserves peak corridors, never invents an intermediate dB
|
||||
# between a masked (NaN) quiet cell and a loud one, and mirrors the
|
||||
# max semantics of sample_noise_at_postcodes.
|
||||
src_arr = source.read(1).astype(np.float32)
|
||||
nodata = source.nodata
|
||||
invalid = ~np.isfinite(src_arr) | (src_arr <= 0)
|
||||
if nodata is not None:
|
||||
invalid |= np.isclose(
|
||||
src_arr, np.float32(nodata), rtol=1e-5, atol=1e-5
|
||||
)
|
||||
src_arr = np.where(invalid, np.float32("nan"), src_arr)
|
||||
tile = np.full((tile_size, tile_size), np.nan, dtype=np.float32)
|
||||
reproject(
|
||||
source=rasterio.band(source, 1),
|
||||
source=src_arr,
|
||||
destination=tile,
|
||||
src_transform=source.transform,
|
||||
src_crs=source.crs,
|
||||
src_nodata=source.nodata if source.nodata is not None else 0,
|
||||
src_nodata=float("nan"),
|
||||
dst_transform=from_bounds(
|
||||
left, bottom, right, top, tile_size, tile_size
|
||||
),
|
||||
dst_crs=WEB_MERCATOR_CRS,
|
||||
dst_nodata=np.nan,
|
||||
resampling=Resampling.bilinear,
|
||||
resampling=Resampling.max,
|
||||
)
|
||||
|
||||
tile[~np.isfinite(tile) | (tile <= 0)] = np.nan
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ from .output import (
|
|||
to_wgs84_geojson_multi,
|
||||
write_district_geojson,
|
||||
)
|
||||
from .process_oa import _extract_polygonal, process_oa
|
||||
from .process_oa import MIN_GEOM_AREA, _extract_polygonal, process_oa
|
||||
from .uprn import get_oa_uprns, load_uprns
|
||||
from .voronoi import _equal_split_fallback, compute_voronoi_regions
|
||||
|
||||
|
|
@ -341,6 +341,65 @@ class TestVoronoiDeduplication:
|
|||
assert "B" in result, "Postcode B missing with int64 coords"
|
||||
|
||||
|
||||
class TestVoronoiCoincidentClusterNotCrushed:
|
||||
"""3+ postcodes at one coordinate must each keep a real cell.
|
||||
|
||||
Pre-fix, the first coincident postcode stayed unjittered at the exact
|
||||
cluster centre; with other seeds in the OA its Voronoi cell was squeezed
|
||||
below MIN_GEOM_AREA, so _clean_polygonal dropped that active postcode
|
||||
downstream. The fix spreads coincident postcodes onto a small regular
|
||||
polygon (equal wedges), so none is crushed.
|
||||
"""
|
||||
|
||||
def test_coincident_cluster_plus_outer_seed_no_postcode_crushed(self):
|
||||
# A block of flats: 4 distinct postcodes share one building coordinate,
|
||||
# plus one other postcode elsewhere in the OA. Pre-fix, the centre seed's
|
||||
# cell collapsed to ~0.0001 m^2 (< MIN_GEOM_AREA) and the postcode was
|
||||
# dropped; every postcode must now keep a non-degenerate cell.
|
||||
boundary = box(0, 0, 1000, 1000)
|
||||
points = np.array(
|
||||
[
|
||||
[500, 500], # A — coincident
|
||||
[500, 500], # B — coincident
|
||||
[500, 500], # C — coincident
|
||||
[500, 500], # D — coincident
|
||||
[100, 100], # OUT — elsewhere in the OA
|
||||
],
|
||||
dtype=np.float64,
|
||||
)
|
||||
postcodes = ["A", "B", "C", "D", "OUT"]
|
||||
result = compute_voronoi_regions(points, postcodes, boundary)
|
||||
for pc in postcodes:
|
||||
assert pc in result, f"Postcode {pc} was dropped"
|
||||
assert result[pc].area > MIN_GEOM_AREA, (
|
||||
f"Postcode {pc} cell {result[pc].area} <= MIN_GEOM_AREA"
|
||||
)
|
||||
|
||||
def test_coincident_cluster_partitions_into_fair_wedges(self, square_boundary):
|
||||
# N postcodes sharing one coordinate split the surrounding area into
|
||||
# roughly equal wedges (regular-polygon seeds), none degenerate.
|
||||
points = np.array([[500050, 180050]] * 5, dtype=np.float64)
|
||||
postcodes = ["A", "B", "C", "D", "E"]
|
||||
result = compute_voronoi_regions(points, postcodes, square_boundary)
|
||||
fair_share = square_boundary.area / len(postcodes)
|
||||
for pc in postcodes:
|
||||
assert pc in result, f"Postcode {pc} was dropped"
|
||||
# Each wedge is a meaningful fraction of its fair share (not crushed).
|
||||
assert result[pc].area > 0.3 * fair_share, (
|
||||
f"Postcode {pc} cell {result[pc].area} far below fair share {fair_share}"
|
||||
)
|
||||
|
||||
def test_two_coincident_split_is_fair(self, square_boundary):
|
||||
"""Regression: two postcodes at one coordinate split ~50/50."""
|
||||
points = np.array([[500050, 180050], [500050, 180050]], dtype=np.float64)
|
||||
postcodes = ["A", "B"]
|
||||
result = compute_voronoi_regions(points, postcodes, square_boundary)
|
||||
assert "A" in result and "B" in result
|
||||
total = result["A"].area + result["B"].area
|
||||
assert result["A"].area / total > 0.4
|
||||
assert result["B"].area / total > 0.4
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bug 4: Voronoi collinear fallback gives everything to first postcode
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -20,33 +20,48 @@ def compute_voronoi_regions(
|
|||
# Convert to float64 so sub-metre jitter isn't truncated.
|
||||
points = points.astype(np.float64)
|
||||
|
||||
# Deduplicate points, keeping one per (location, postcode) pair.
|
||||
# Multiple postcodes at the same coordinate each get their own point,
|
||||
# jittered by a tiny offset (0.01m) so Voronoi can distinguish them.
|
||||
# Coords are rounded to mm precision for stable hashing — UPRN inputs are
|
||||
# already integer metres, but the float64 cast can introduce ULP noise.
|
||||
GOLDEN_ANGLE = np.pi * (3.0 - np.sqrt(5.0))
|
||||
# Deduplicate points, keeping one per (location, postcode) pair. Coords are
|
||||
# rounded to mm precision for stable hashing — UPRN inputs are already integer
|
||||
# metres, but the float64 cast can introduce ULP noise.
|
||||
#
|
||||
# Where several DISTINCT postcodes share one coordinate, jitter ALL of them
|
||||
# onto a small regular polygon (equal 0.01m radius, equally spaced by angle)
|
||||
# so their Voronoi cells become equal wedges and NONE is crushed. Leaving any
|
||||
# seed at the centre — or innermost on a spiral — squeezes its cell below
|
||||
# MIN_GEOM_AREA, which _clean_polygonal then drops downstream, silently losing
|
||||
# an active postcode. Seeds at a UNIQUE coordinate are left exactly on their
|
||||
# UPRN (no perturbation of normal Voronoi output).
|
||||
rounded_coords = [
|
||||
(round(float(points[i, 0]), 3), round(float(points[i, 1]), 3))
|
||||
for i in range(len(points))
|
||||
]
|
||||
coord_postcodes: dict[tuple[float, float], set[str]] = defaultdict(set)
|
||||
for coord, pc in zip(rounded_coords, postcodes):
|
||||
coord_postcodes[coord].add(pc)
|
||||
|
||||
seen: dict[tuple[float, float, str], bool] = {}
|
||||
unique_pts = []
|
||||
unique_pcs = []
|
||||
coord_counts: dict[tuple[float, float], int] = defaultdict(int)
|
||||
for i in range(len(points)):
|
||||
coord = (round(float(points[i, 0]), 3), round(float(points[i, 1]), 3))
|
||||
coord = rounded_coords[i]
|
||||
key = (coord[0], coord[1], postcodes[i])
|
||||
if key not in seen:
|
||||
seen[key] = True
|
||||
jitter_idx = coord_counts[coord]
|
||||
coord_counts[coord] += 1
|
||||
if jitter_idx == 0:
|
||||
unique_pts.append(points[i].copy())
|
||||
else:
|
||||
# Golden-angle spacing distributes any number of jittered
|
||||
# points evenly around (and outward from) the original coord.
|
||||
count = len(coord_postcodes[coord])
|
||||
if count > 1:
|
||||
# Coincident cluster: equally-spaced regular polygon -> equal
|
||||
# Voronoi wedges, so every postcode here keeps a fair share.
|
||||
jitter_idx = coord_counts[coord]
|
||||
coord_counts[coord] += 1
|
||||
angle = 2.0 * np.pi * jitter_idx / count
|
||||
jittered = points[i].copy()
|
||||
angle = jitter_idx * GOLDEN_ANGLE
|
||||
jittered[0] += 0.01 * np.cos(angle)
|
||||
jittered[1] += 0.01 * np.sin(angle)
|
||||
unique_pts.append(jittered)
|
||||
else:
|
||||
unique_pts.append(points[i].copy())
|
||||
unique_pcs.append(postcodes[i])
|
||||
|
||||
if len(unique_pts) == 1:
|
||||
|
|
|
|||
|
|
@ -19,8 +19,7 @@ from tqdm import tqdm
|
|||
from pipeline.transform.price_estimation.shrinkage import (
|
||||
blend_dicts,
|
||||
hierarchical_shrinkage,
|
||||
reanchor_dict,
|
||||
reanchor_dicts,
|
||||
lift_onto_parent,
|
||||
shrink_dicts,
|
||||
spatial_smooth,
|
||||
)
|
||||
|
|
@ -169,33 +168,47 @@ def solve_robust_index(
|
|||
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
|
||||
|
||||
# Temporal smoothness prior: penalise curvature in the year betas with a
|
||||
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
|
||||
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
|
||||
# zero target). This damps single-year index spikes without flattening
|
||||
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
|
||||
# year (min_year, implicit beta=0) has no column, so the penalty spans the
|
||||
# non-baseline years only. For cells with <3 betas there is no curvature to
|
||||
# penalise and the solve is unchanged.
|
||||
# second-difference penalty lambda * (d2 beta / dt2)^2, encoded as extra
|
||||
# least-squares rows (sqrt(lambda) * [w0, w1, w2] against a zero target).
|
||||
# The weights are the CALENDAR-SPACING-AWARE second-derivative coefficients
|
||||
# for the consecutive triple (y0, y1, y2), so gap years are not treated as
|
||||
# adjacent: a multi-year gap relaxes the penalty (correctly preserving a
|
||||
# genuine level jump) instead of forcing a smooth ramp. For unit spacing
|
||||
# (1, 1) these reduce to [1, -2, 1], leaving contiguous cells unchanged.
|
||||
# This damps single-year index spikes without flattening genuine trends.
|
||||
# Betas are ordered by calendar year; the baseline year (min_year, implicit
|
||||
# beta=0) has no column, so the penalty spans the non-baseline years only.
|
||||
# For cells with <3 betas there is no curvature to penalise and the solve is
|
||||
# unchanged.
|
||||
n_pen = 0
|
||||
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
|
||||
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
|
||||
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
|
||||
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
|
||||
cols_by_year = [c for _, c in sorted(year_to_col.items())]
|
||||
years_sorted = sorted(year_to_col)
|
||||
cols_by_year = [year_to_col[y] for y in years_sorted]
|
||||
n_pen = n_cols - 2
|
||||
pen_rows = np.repeat(n + np.arange(n_pen), 3)
|
||||
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
|
||||
pen_vals = np.empty(n_pen * 3, dtype=np.float64)
|
||||
for k in range(n_pen):
|
||||
pen_cols[3 * k : 3 * k + 3] = (
|
||||
cols_by_year[k],
|
||||
cols_by_year[k + 1],
|
||||
cols_by_year[k + 2],
|
||||
)
|
||||
y0, y1, y2 = years_sorted[k], years_sorted[k + 1], years_sorted[k + 2]
|
||||
w0 = 2.0 / ((y1 - y0) * (y2 - y0))
|
||||
w1 = -2.0 / ((y1 - y0) * (y2 - y1))
|
||||
w2 = 2.0 / ((y2 - y1) * (y2 - y0))
|
||||
pen_vals[3 * k : 3 * k + 3] = (
|
||||
sqrt_lambda * w0,
|
||||
sqrt_lambda * w1,
|
||||
sqrt_lambda * w2,
|
||||
)
|
||||
pen_rows_arr = pen_rows.astype(np.int64)
|
||||
pen_cols_arr = pen_cols
|
||||
pen_vals_arr = np.tile(
|
||||
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
|
||||
).astype(np.float64)
|
||||
pen_vals_arr = pen_vals
|
||||
pen_b = np.zeros(n_pen, dtype=np.float64)
|
||||
n_total_rows = n + n_pen
|
||||
|
||||
|
|
@ -252,7 +265,11 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
|
|||
idx = solve_robust_index(y1, y2, lr, w)
|
||||
if idx:
|
||||
indices[key] = idx
|
||||
n_pairs[key] = len(y1)
|
||||
# Count only information-bearing pairs: same-year pairs (year1 == year2,
|
||||
# which includes baseline-baseline pairs) cancel in the sparse solve and contribute
|
||||
# zero information to the annual index, so including them would
|
||||
# inflate the shrinkage weight n/(n+k) and under-shrink noisy sectors.
|
||||
n_pairs[key] = int(np.count_nonzero(y2 != y1))
|
||||
return indices, n_pairs
|
||||
|
||||
|
||||
|
|
@ -433,20 +450,17 @@ def build_index(
|
|||
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
|
||||
)
|
||||
|
||||
# Re-anchor every repeat-sales dict to the global base year before any
|
||||
# shrinkage/smoothing/blending. solve_robust_index anchors each cell to
|
||||
# log-index 0 at its OWN earliest year, so cells with shorter histories
|
||||
# are measured from a later origin; combining them key-by-key would
|
||||
# otherwise average level-incompatible numbers. The hedonic fallback is
|
||||
# already anchored at min_year, so we align everything to min_year.
|
||||
national_idx = reanchor_dict(national_idx, min_year)
|
||||
area_idx = reanchor_dicts(area_idx, min_year)
|
||||
district_idx = reanchor_dicts(district_idx, min_year)
|
||||
sector_idx = reanchor_dicts(sector_idx, min_year)
|
||||
|
||||
# Shrinkage: national -> hedonic first, then hierarchical
|
||||
# Shrinkage: national -> hedonic first, then hierarchical. Each cell is
|
||||
# anchored to log-index 0 at its OWN earliest year (solve_robust_index),
|
||||
# so cells with shorter histories sit on a later origin than their wider
|
||||
# parents. Before each blend we lift the child onto its parent's base at
|
||||
# the child's first year (lift_onto_parent) -- otherwise combining them
|
||||
# key-by-key averages level-incompatible numbers. The hedonic fallback is
|
||||
# anchored at the global min_year, so it serves as the base for national.
|
||||
print(" Applying shrinkage...")
|
||||
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
|
||||
national_shrunk = shrink_dicts(
|
||||
lift_onto_parent(national_idx, hedonic_idx), hedonic_idx, national_n
|
||||
)
|
||||
sector_shrunk = hierarchical_shrinkage(
|
||||
sector_idx,
|
||||
sector_n,
|
||||
|
|
@ -459,6 +473,7 @@ def build_index(
|
|||
sector_to_dist,
|
||||
dist_to_area,
|
||||
shrink_dicts,
|
||||
lift_onto_parent,
|
||||
)
|
||||
|
||||
# Spatial smoothing
|
||||
|
|
|
|||
|
|
@ -142,6 +142,20 @@ def _sale_identity_matches(
|
|||
target_price: float,
|
||||
target_sale_date: int,
|
||||
) -> np.ndarray:
|
||||
"""Mark pool comparables that are (almost certainly) the target's own sale.
|
||||
|
||||
properties.parquet has no per-property id, so a sale is identified by the
|
||||
proxy tuple (postcode, price within 0.5, sale_date) to keep a target's own
|
||||
prior sale out of its comparable set (leakage prevention).
|
||||
|
||||
Limitation: new-build / bulk blocks sell many DISTINCT properties in one
|
||||
postcode on the same day at the same price, so all such siblings collide on
|
||||
this proxy and are excluded together. This is intentional conservative
|
||||
over-exclusion: it guarantees no leakage at the cost of occasionally
|
||||
dropping legitimate same-(postcode, price, date) siblings. The effect is
|
||||
bounded (~1.8% of the pool) and a precise fix would require a per-property
|
||||
id that the data does not carry.
|
||||
"""
|
||||
if not target_postcode or not np.isfinite(target_price) or target_sale_date < 0:
|
||||
return np.zeros(len(pool_postcodes), dtype=bool)
|
||||
return (
|
||||
|
|
@ -166,6 +180,16 @@ def knn_median_psm(
|
|||
|
||||
PSM is at the reference date used when building the pool.
|
||||
NaN where not computable (missing coords, unknown type, too few neighbors).
|
||||
|
||||
Coordinate limitation: lat/lon come from postcode.parquet (one centroid per
|
||||
postcode), so every property within a postcode is co-located. For a dense
|
||||
postcode the "k nearest" therefore degenerates into an arbitrary
|
||||
same-postcode subset whose membership is decided by KDTree index order
|
||||
rather than true proximity. No property-level coordinates exist to fix this,
|
||||
so the kNN signal is treated as a weak, noisy prior: the downstream guarded
|
||||
blend (guarded_blend_estimates) only blends kNN when it is close to the
|
||||
index estimate and otherwise discards it, bounding the impact of this
|
||||
degeneracy. The result is deterministic for a fixed pool order.
|
||||
"""
|
||||
n = len(lat)
|
||||
result = np.full(n, np.nan)
|
||||
|
|
|
|||
|
|
@ -36,26 +36,43 @@ def _base_value(index: dict[int, float], base_year: int) -> float:
|
|||
return index[prior[-1]]
|
||||
|
||||
|
||||
def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
|
||||
"""Re-anchor an index dict so index[base_year] == 0 (pure constant shift).
|
||||
def lift_onto_parent(
|
||||
child: dict[int, float], parent: dict[int, float]
|
||||
) -> dict[int, float]:
|
||||
"""Lift a child index onto its parent's base before blending the two.
|
||||
|
||||
Subtracting the same constant from every year preserves all within-dict
|
||||
year-to-year differences, so estimate.py's (current - sale) semantics are
|
||||
unchanged; it only fixes the cross-dict level mismatch before blending.
|
||||
solve_robust_index anchors every cell to log-index 0 at its OWN earliest
|
||||
year, so a cell with a shorter history sits on a later origin than its
|
||||
(wider) parent. Combining them key-by-key would average level-incompatible
|
||||
numbers (a sector measured from 2008 blended with a district measured from
|
||||
1996). We add the parent's accumulated level at the child's first year, so
|
||||
``child[start] == parent[start]``: the child's own year-to-year moves are
|
||||
layered on top of the parent's growth up to that point -- the same
|
||||
assumption shrinkage already makes for years the child lacks.
|
||||
|
||||
Re-basing at each cell's OWN earliest year (rather than at the global base,
|
||||
which the child cannot observe) is what makes this effective: the child's
|
||||
value at the global base is always 0, so subtracting it changes nothing.
|
||||
|
||||
The shift is a single constant added to every year of the child, so the
|
||||
child's own year-to-year differences are preserved. PRECONDITION for the
|
||||
downstream estimate to be unaffected within the child's range: the parent's
|
||||
year coverage must be a superset of the child's. This holds throughout
|
||||
build_index, where each parent aggregates a superset of its children's sale
|
||||
pairs, so shrink_dicts blends every child year against a present parent year
|
||||
and the constant shift cancels in a within-range (current - sale) difference;
|
||||
only comparisons that span the child's start year (e.g. a sale predating the
|
||||
cell's own data) change. If a caller violates the precondition (a child year
|
||||
the parent lacks), shrink_dicts passes that year through unshrunk and the
|
||||
cancellation no longer holds.
|
||||
"""
|
||||
if not index:
|
||||
return index
|
||||
shift = _base_value(index, base_year)
|
||||
if shift == 0.0:
|
||||
return index
|
||||
return {y: v - shift for y, v in index.items()}
|
||||
|
||||
|
||||
def reanchor_dicts(
|
||||
indices: dict[str, dict[int, float]], base_year: int
|
||||
) -> dict[str, dict[int, float]]:
|
||||
"""Re-anchor every index dict in a mapping to the common `base_year`."""
|
||||
return {key: reanchor_dict(idx, base_year) for key, idx in indices.items()}
|
||||
if not child or not parent:
|
||||
return child
|
||||
child_start = min(child)
|
||||
offset = _base_value(parent, child_start) - child[child_start]
|
||||
if offset == 0.0:
|
||||
return child
|
||||
return {y: v + offset for y, v in child.items()}
|
||||
|
||||
|
||||
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
|
||||
|
|
@ -84,30 +101,40 @@ def hierarchical_shrinkage(
|
|||
sector_to_dist: dict[str, str],
|
||||
dist_to_area: dict[str, str],
|
||||
shrink_fn: Callable[[V, V, int], V],
|
||||
lift_fn: Callable[[V, V], V] | None = None,
|
||||
) -> dict[str, V]:
|
||||
"""Top-down hierarchical shrinkage: area->top, district->area, sector->district.
|
||||
|
||||
`top_level` is the ultimate fallback value (e.g. national shrunk toward hedonic,
|
||||
or just national). `shrink_fn(raw, parent, n)` blends raw toward parent.
|
||||
`lift_fn(raw, parent)`, if given, re-bases raw onto its parent before blending
|
||||
(see lift_onto_parent); pass None for category-keyed dicts where re-basing is
|
||||
meaningless.
|
||||
"""
|
||||
|
||||
def combine(raw: V, parent: V, n: int) -> V:
|
||||
if lift_fn is not None:
|
||||
raw = lift_fn(raw, parent)
|
||||
return shrink_fn(raw, parent, n)
|
||||
|
||||
# Area -> top level
|
||||
area_shrunk = {}
|
||||
for area, val in area_vals.items():
|
||||
area_shrunk[area] = shrink_fn(val, top_level, area_n[area])
|
||||
area_shrunk[area] = combine(val, top_level, area_n[area])
|
||||
|
||||
# District -> area
|
||||
district_shrunk = {}
|
||||
for dist, val in district_vals.items():
|
||||
a = dist_to_area.get(dist, "")
|
||||
parent = area_shrunk.get(a, top_level)
|
||||
district_shrunk[dist] = shrink_fn(val, parent, district_n[dist])
|
||||
district_shrunk[dist] = combine(val, parent, district_n[dist])
|
||||
|
||||
# Sector -> district
|
||||
sector_shrunk = {}
|
||||
for sec, val in sector_vals.items():
|
||||
d = sector_to_dist.get(sec, "")
|
||||
parent = district_shrunk.get(d, top_level)
|
||||
sector_shrunk[sec] = shrink_fn(val, parent, sector_n[sec])
|
||||
sector_shrunk[sec] = combine(val, parent, sector_n[sec])
|
||||
|
||||
# Fill sectors without their own values
|
||||
for sec in all_sectors:
|
||||
|
|
|
|||
135
pipeline/transform/price_estimation/test_index.py
Normal file
135
pipeline/transform/price_estimation/test_index.py
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.price_estimation import index as index_mod
|
||||
from pipeline.transform.price_estimation.index import (
|
||||
compute_indices_for_level,
|
||||
solve_robust_index,
|
||||
)
|
||||
|
||||
|
||||
def _pairs_from_path(true_levels: dict[int, float]):
|
||||
"""Build adjacent-year repeat-sale pairs that exactly trace a known path.
|
||||
|
||||
Each consecutive pair's log_ratio is the difference of the true log-levels,
|
||||
so the solver should recover the levels exactly (relative to the min year).
|
||||
"""
|
||||
years = sorted(true_levels)
|
||||
y1, y2, lr, w = [], [], [], []
|
||||
for a, b in zip(years[:-1], years[1:]):
|
||||
y1.append(a)
|
||||
y2.append(b)
|
||||
lr.append(true_levels[b] - true_levels[a])
|
||||
w.append(1.0)
|
||||
return (
|
||||
np.array(y1, dtype=np.int32),
|
||||
np.array(y2, dtype=np.int32),
|
||||
np.array(lr, dtype=np.float64),
|
||||
np.array(w, dtype=np.float64),
|
||||
)
|
||||
|
||||
|
||||
def test_solver_recovers_contiguous_path():
    """The solver reproduces a contiguous price path relative to its first year.

    For unit-spaced years the spacing-aware penalty is expected to reduce to
    the standard [1,-2,1] second difference, so a zero-curvature ramp should
    come back (within tolerance) as log-levels anchored at 0 in the min year.
    """
    first_year, last_year = 2010, 2020
    span = range(first_year, last_year + 1)
    # Smooth (zero curvature) ramp of +0.04 log-points per year.
    expected = {year: 0.04 * (year - first_year) for year in span}

    # Triplicate every adjacent pair so MIN_PAIRS is comfortably met.
    tiled = [np.tile(column, 3) for column in _pairs_from_path(expected)]

    index = solve_robust_index(*tiled)

    assert index[first_year] == 0.0  # baseline anchor
    for year in span:
        target = expected[year] - expected[first_year]
        assert abs(index[year] - target) < 1e-3
|
||||
|
||||
|
||||
def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
    """FIX #5: a sharp level jump across a multi-year gap must be preserved.

    Observed years are 2000, 2001, 2002 and then 2015, 2016, with cross-gap
    pairs that encode a sharp jump at the gap. A uniform [1,-2,1] curvature
    penalty would treat (beta_2002, beta_2015, beta_2016) as three adjacent
    years and drag beta_2015 down toward a smooth ramp; the spacing-aware
    second difference is meant to relax the penalty across the gap.
    """
    # True log-levels relative to the min year (2000 anchored at 0).
    levels = {
        2000: 0.0,
        2001: 0.05,
        2002: 0.10,
        2015: 1.10,  # sharp +1.0 jump across the gap
        2016: 1.15,
    }

    copies = 4
    observed = [
        # In-segment adjacent pairs.
        (2000, 2001),
        (2001, 2002),
        (2015, 2016),
        # Cross-gap pairs consistent with the sharp jump.
        (2002, 2015),
        (2002, 2016),
    ]
    expanded = [pair for pair in observed for _ in range(copies)]

    y1 = np.array([start for start, _ in expanded], dtype=np.int32)
    y2 = np.array([end for _, end in expanded], dtype=np.int32)
    lr = np.array(
        [levels[end] - levels[start] for start, end in expanded], dtype=np.float64
    )
    w = np.array([1.0] * len(expanded), dtype=np.float64)

    # Use a strong penalty so any smoothing bias is obvious; always restore the
    # module-level setting afterwards so other tests are unaffected.
    saved_lambda = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
    index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = 1.0
    try:
        index = solve_robust_index(y1, y2, lr, w)
    finally:
        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = saved_lambda

    assert index[2000] == 0.0  # baseline anchor
    # beta_2015 must stay near its true post-gap level rather than being pulled
    # down by a curvature penalty that treats the gap as a single-year step.
    post_gap_error = abs(index[2015] - levels[2015])
    assert post_gap_error < 0.05
|
||||
|
||||
|
||||
def test_n_pairs_counts_only_cross_year_pairs():
    """FIX #12: n_pairs counts only cross-year (year2 != year1) pairs.

    Same-year pairs carry zero index information, so they must not inflate the
    shrinkage weight.
    """

    def pair_rows(group, year1, year2, n):
        # n identical repeat-sale rows growing 0.03 log-points per year.
        return [
            {
                "grp": group,
                "year1": year1,
                "year2": year2,
                "log_ratio": 0.03 * (year2 - year1),
                "weight": 1.0,
            }
            for _ in range(n)
        ]

    # 8 genuine cross-year pairs spanning enough years for a valid solve, plus
    # 3 zero-information same-year pairs that must not be counted.
    records = (
        pair_rows("g", 2010, 2011, 4)
        + pair_rows("g", 2011, 2012, 4)
        + pair_rows("g", 2012, 2012, 3)  # same-year, zero info
    )

    indices, n_pairs = compute_indices_for_level(pl.DataFrame(records), "grp")

    assert "g" in indices
    assert n_pairs["g"] == 8  # not 11
|
||||
|
|
@ -71,9 +71,49 @@ def test_knn_excludes_same_sale_and_uses_stable_comparables():
|
|||
),
|
||||
)
|
||||
|
||||
# The five 900k same-postcode siblings share the target's (postcode, price,
|
||||
# date) identity proxy, so they are all excluded as comparables, leaving the
|
||||
# 200k/80sqm = 2_500 PSM neighbours. Removing same-identity siblings is an
|
||||
# INTENTIONAL conservative leakage-prevention tradeoff (no per-property id
|
||||
# exists to distinguish a target's own resale from a distinct bulk-block
|
||||
# sibling sold same-day at the same price), not ideal behaviour -- see the
|
||||
# _sale_identity_matches docstring.
|
||||
assert psm[0] == 2_500.0
|
||||
|
||||
|
||||
def test_knn_median_psm_is_deterministic():
    """Reproducibility guard (BUG #6) for the kNN median price-per-sqm.

    For dense postcodes the kNN result depends on an arbitrary same-postcode
    subset of neighbours. That is acceptable, but it MUST be stable: two
    identical calls against the same trees and inputs have to return identical
    output, so a future refactor cannot silently introduce run-to-run
    nondeterminism.
    """
    sale_date = date(2026, 1, 1)
    pool_size = 40
    pool = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"] * pool_size,
            "Property type": ["Detached"] * pool_size,
            "lat": [51.5000 + i * 0.00001 for i in range(pool_size)],
            "lon": [-0.1000] * pool_size,
            "Total floor area (sqm)": [80.0] * pool_size,
            "Last known price": [200_000.0 + i * 1_000.0 for i in range(pool_size)],
            "Date of last transaction": [sale_date] * pool_size,
        }
    )
    trees = build_knn_pool(pool.lazy(), _flat_index(), 2026.0)

    query = {
        "lat": np.array([51.5000, 51.5002]),
        "lon": np.array([-0.1000, -0.1000]),
        "type_groups": np.array(["Detached", "Detached"]),
    }
    run_one = knn_median_psm(trees, **query)
    run_two = knn_median_psm(trees, **query)

    assert np.array_equal(run_one, run_two)
|
||||
|
||||
|
||||
def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
|
||||
blended = guarded_blend_estimates(
|
||||
index_est=np.array([120_000.0, 1_000_000.0]),
|
||||
|
|
|
|||
|
|
@ -1,99 +1,117 @@
|
|||
"""Regression tests for common-base-year re-anchoring before blending.
|
||||
"""Regression tests for parent-base lifting before hierarchical blending.
|
||||
|
||||
Each repeat-sales index dict is anchored to log-index 0 at its OWN earliest
|
||||
year. shrink_dicts / blend_dicts combine dicts key-by-key, so dicts anchored to
|
||||
different base years must be re-anchored to a single common base first, or the
|
||||
solve_robust_index anchors every repeat-sales cell to log-index 0 at its OWN
|
||||
earliest year, so a cell with a shorter history sits on a later origin than its
|
||||
(wider) parent. shrink_dicts / blend_dicts combine dicts key-by-key, so a child
|
||||
must first be lifted onto its parent's base at the child's first year, or the
|
||||
blend averages level-incompatible numbers (fix5-index-base-year).
|
||||
|
||||
Note: re-anchoring each cell to the *global* base year is a no-op on real data
|
||||
(a cell anchored to 0 at its own earliest year already reads 0 there, and the
|
||||
global base is never later), which is why the fix lifts onto the *parent* at the
|
||||
child's own start year instead.
|
||||
"""
|
||||
|
||||
from pipeline.transform.price_estimation.shrinkage import (
|
||||
blend_dicts,
|
||||
reanchor_dict,
|
||||
reanchor_dicts,
|
||||
hierarchical_shrinkage,
|
||||
lift_onto_parent,
|
||||
shrink_dicts,
|
||||
)
|
||||
from pipeline.transform.price_estimation.utils import SHRINKAGE_K
|
||||
|
||||
|
||||
def test_reanchor_is_pure_constant_shift_preserving_differences():
|
||||
"""Re-anchoring only shifts the origin; year-to-year moves are unchanged."""
|
||||
# Anchored at its own earliest year 2008.
|
||||
idx = {2008: 0.0, 2009: 0.10, 2010: 0.25, 2011: 0.40}
|
||||
def test_lift_rebases_late_starting_child_onto_parent():
|
||||
"""A child anchored at its own later start year is lifted to the parent's level there."""
|
||||
parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
|
||||
# Sector with its own repeat-sales data only from 2016, anchored at 2016 = 0.
|
||||
sector = {2016: 0.0, 2024: 0.20}
|
||||
|
||||
reanchored = reanchor_dict(idx, 1996)
|
||||
# 1996 is before this dict's history -> back-fill earliest value (0.0),
|
||||
# so the shift is 0 and the dict is unchanged.
|
||||
assert reanchored[2008] == 0.0
|
||||
lifted = lift_onto_parent(sector, parent)
|
||||
|
||||
# Same shape, different exact-hit base year: anchoring at 2010 subtracts 0.25.
|
||||
reanchored_2010 = reanchor_dict(idx, 2010)
|
||||
assert reanchored_2010[2010] == 0.0
|
||||
# All within-dict differences are preserved under the constant shift.
|
||||
years = sorted(idx)
|
||||
for a, b in zip(years, years[1:]):
|
||||
assert abs((reanchored_2010[b] - reanchored_2010[a]) - (idx[b] - idx[a])) < 1e-12
|
||||
# child[start] now equals the parent's accumulated level at that year.
|
||||
assert abs(lifted[2016] - parent[2016]) < 1e-12 # 1.20
|
||||
assert abs(lifted[2024] - (parent[2016] + 0.20)) < 1e-12 # 1.40
|
||||
# Pure constant shift: the child's own year-to-year move is preserved.
|
||||
assert abs((lifted[2024] - lifted[2016]) - (sector[2024] - sector[2016])) < 1e-12
|
||||
|
||||
|
||||
def test_blend_different_base_years_needs_reanchoring():
|
||||
"""Blending two dicts on different bases is biased unless re-anchored first.
|
||||
def test_lift_is_noop_when_child_starts_at_parent_base():
    """Lifting leaves a child unchanged when it starts at the parent's base year.

    Both dicts begin at 1996, where the parent reads 0, so the lift is expected
    to add nothing to the child's values.
    """
    parent_index = {1996: 0.0, 2008: 0.80, 2016: 1.20}
    child_index = {1996: 0.0, 2008: 0.75, 2016: 1.10}

    lifted = lift_onto_parent(child_index, parent_index)

    assert lifted == child_index
|
||||
|
||||
Both cells observe the common base year 1996 but were anchored to DIFFERENT
|
||||
origins (sectorA at 1996, sectorB at 2008, as solve_robust_index would do for
|
||||
cells whose pair history starts at different years). They describe the SAME
|
||||
true trajectory measured from 1996, so a 50/50 blend should reproduce that
|
||||
common level. Pre-fix, blend_dicts mixes sectorB's 2008-relative numbers with
|
||||
sectorA's 1996-relative numbers, level-shifting the smoothed result.
|
||||
|
||||
def test_lift_handles_empty_inputs():
    """An empty child stays empty; an empty parent leaves the child as-is."""
    lifted_empty_child = lift_onto_parent({}, {2000: 0.0})
    assert lifted_empty_child == {}

    lifted_without_parent = lift_onto_parent({2000: 0.0}, {})
    assert lifted_without_parent == {2000: 0.0}
|
||||
|
||||
|
||||
def test_lift_fixes_estimate_spanning_child_start_but_not_within_range():
|
||||
"""The lift corrects comparisons that span the cell's start year, and ONLY those.
|
||||
|
||||
A property sold in 2008 (before the sector's own data begins in 2016) and
|
||||
valued in 2024: pre-lift the shrunk index mixes a 2016-based sector level
|
||||
with 1996-based parent levels and badly understates the move. Comparisons
|
||||
wholly inside the sector's own range (2016->2024) are unchanged, because the
|
||||
lift is a pure constant shift that cancels in a within-cell difference.
|
||||
"""
|
||||
base_year = 1996
|
||||
parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
|
||||
sector = {2016: 0.0, 2024: 0.20} # own data starts 2016
|
||||
n = 30
|
||||
w = n / (n + SHRINKAGE_K)
|
||||
|
||||
# True log-levels relative to 1996 (identical trajectory for both cells).
|
||||
truth = {1996: 0.0, 2008: 0.80, 2012: 1.00}
|
||||
raw = shrink_dicts(sector, parent, n) # pre-fix: blend without lifting
|
||||
fixed = shrink_dicts(lift_onto_parent(sector, parent), parent, n)
|
||||
|
||||
# sectorA: anchored at 1996 (its earliest year) -> equals truth.
|
||||
sector_a = dict(truth)
|
||||
# sectorB: same trajectory but anchored at 2008 (subtract truth[2008] from
|
||||
# every year), exactly how solve_robust_index would express a cell whose
|
||||
# earliest year happened to be picked as 2008.
|
||||
shift_b = truth[2008]
|
||||
sector_b = {y: v - shift_b for y, v in truth.items()}
|
||||
# Within the sector's own range the lift changes nothing.
|
||||
assert abs((fixed[2024] - fixed[2016]) - (raw[2024] - raw[2016])) < 1e-12
|
||||
|
||||
# --- Pre-fix behaviour: blend the raw dicts directly. ---
|
||||
raw_blend = blend_dicts(sector_a, [sector_b], 0.5, [0.5])
|
||||
# Every year is pulled by half of shift_b (0.4) away from the truth.
|
||||
assert abs(raw_blend[2012] - truth[2012]) > 0.3
|
||||
assert abs(raw_blend[1996] - truth[1996]) > 0.3
|
||||
# 2008 is parent-only in both (sector absent), so both read parent[2008].
|
||||
assert abs(raw[2008] - parent[2008]) < 1e-12
|
||||
assert abs(fixed[2008] - parent[2008]) < 1e-12
|
||||
|
||||
# --- Post-fix behaviour: re-anchor to the common base, THEN blend. ---
|
||||
reanchored = reanchor_dicts({"A": sector_a, "B": sector_b}, base_year)
|
||||
fixed_blend = blend_dicts(reanchored["A"], [reanchored["B"]], 0.5, [0.5])
|
||||
# Both cells now read 0 at 1996 and the true level at every shared year.
|
||||
for y in truth:
|
||||
assert abs(fixed_blend[y] - truth[y]) < 1e-9
|
||||
raw_move = raw[2024] - raw[2008]
|
||||
fixed_move = fixed[2024] - fixed[2008]
|
||||
# Hand-computed: raw[2024] = w*0.20 + (1-w)*1.50; fixed[2024] = w*1.40 + (1-w)*1.50.
|
||||
assert abs(raw_move - ((w * 0.20 + (1 - w) * 1.50) - 0.80)) < 1e-12
|
||||
assert abs(fixed_move - ((w * 1.40 + (1 - w) * 1.50) - 0.80)) < 1e-12
|
||||
# The fix raises the spanning move by exactly the parent growth to the
|
||||
# sector's start year that the raw blend dropped (weighted by w).
|
||||
assert abs((fixed_move - raw_move) - w * parent[2016]) < 1e-12
|
||||
# Fixed move is close to the true area-level move (0.70); raw badly understates it.
|
||||
assert abs(fixed_move - 0.70) < 0.2
|
||||
assert raw_move < 0.4 * fixed_move
|
||||
|
||||
|
||||
def test_shrink_dicts_after_reanchoring_is_consistent():
|
||||
"""Shrinking a cell toward its parent must use a common origin."""
|
||||
base_year = 2000
|
||||
# Parent (national) anchored at 2000.
|
||||
parent = {2000: 0.0, 2010: 0.50, 2020: 1.20}
|
||||
# Sector tracking the parent exactly but anchored at 2010 (subtract 0.50 from
|
||||
# every year), as solve_robust_index would express a cell whose earliest year
|
||||
# is later. It still observes the 2000 base year (value -0.50).
|
||||
sector = {2000: -0.50, 2010: 0.0, 2020: 0.70}
|
||||
n = 0 # no own data weight -> result should equal parent after anchoring
|
||||
def test_hierarchical_shrinkage_lift_fn_only_changes_spanning_comparisons():
|
||||
"""Integration: passing lift_fn re-bases a late-starting sector via its parent chain."""
|
||||
top = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
|
||||
sector = {"AB1 1": {2016: 0.0, 2024: 0.20}}
|
||||
sector_n = {"AB1 1": 300}
|
||||
# No own area/district indices -> the sector shrinks straight toward `top`.
|
||||
base_args = (
|
||||
sector,
|
||||
sector_n,
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
top,
|
||||
["AB1 1"],
|
||||
{"AB1 1": "AB1"},
|
||||
{"AB1": "AB"},
|
||||
shrink_dicts,
|
||||
)
|
||||
|
||||
reanchored_sector = reanchor_dict(sector, base_year)
|
||||
# Exact hit on 2000 subtracts -0.50, putting the sector back on the parent's
|
||||
# origin: 0.0 at 2000, 0.50 at 2010, 1.20 at 2020.
|
||||
shrunk = shrink_dicts(reanchored_sector, parent, n)
|
||||
assert abs(shrunk[2000] - 0.0) < 1e-9
|
||||
assert abs(shrunk[2010] - 0.50) < 1e-9
|
||||
assert abs(shrunk[2020] - 1.20) < 1e-9
|
||||
without_lift = hierarchical_shrinkage(*base_args)["AB1 1"]
|
||||
with_lift = hierarchical_shrinkage(*base_args, lift_onto_parent)["AB1 1"]
|
||||
|
||||
|
||||
def test_reanchor_exact_hit_shifts_all_years():
|
||||
"""When the base year is present, subtract its value from every year."""
|
||||
idx = {1996: 0.0, 2005: 0.30, 2015: 0.90}
|
||||
reanchored = reanchor_dict(idx, 2005)
|
||||
assert reanchored[2005] == 0.0
|
||||
assert abs(reanchored[1996] - (-0.30)) < 1e-12
|
||||
assert abs(reanchored[2015] - 0.60) < 1e-12
|
||||
# Within the sector's own range: identical (pure constant shift cancels).
|
||||
assert abs(
|
||||
(with_lift[2024] - with_lift[2016]) - (without_lift[2024] - without_lift[2016])
|
||||
) < 1e-12
|
||||
# Spanning the sector's start year: the lift raises the 2008->2024 move.
|
||||
assert (with_lift[2024] - with_lift[2008]) > (
|
||||
without_lift[2024] - without_lift[2008]
|
||||
) + 0.1
|
||||
|
|
|
|||
|
|
@ -252,6 +252,47 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
|
|||
assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}
|
||||
|
||||
|
||||
def test_serious_rollup_avg_yr_equals_mean_of_rollup_bars(tmp_path):
    """The "Serious crime (avg/yr)" headline equals the mean of its by-year bars.

    Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only
    in 2014 and Robbery only in 2024 (each a single full month -> 12/yr). The
    by-year bars span the UNION of years in which any serious type occurred.
    Summing the per-type means would divide each type by its OWN years-present
    (1 each) and give 12 + 12 = 24; the consistent rollup divides the per-year
    serious total by the 2 years any serious type occurred and gives 12.
    """
    units_dir = tmp_path / "units"
    boundary = _square_feature("AB1 1AA", 1000, 1000, 1010, 1010)
    _write_boundaries(units_dir, {"AB1": [boundary]})

    crime_dir = tmp_path / "crime"
    for month, crime_type in (("2014-01", "Burglary"), ("2024-01", "Robbery")):
        _write_month(crime_dir, month, [_crime_row(month, 1005, 1005, crime_type)])

    avg_path = tmp_path / "crime_by_postcode.parquet"
    by_year_path = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime_dir, units_dir, avg_path, by_year_path, buffer_m=50.0)

    headline = pl.read_parquet(avg_path).row(0, named=True)
    # The precomputed rollup headline exists and equals the mean of the bars
    # (12), not the sum of the per-type avg/yr values (12 + 12 = 24).
    assert "Serious crime (avg/yr)" in headline
    assert headline["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
    assert headline["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05)
    assert headline["Serious crime (avg/yr)"] == pytest.approx(12.0, abs=0.05)

    by_year_row = pl.read_parquet(by_year_path).row(0, named=True)
    serious_bars = {}
    for point in by_year_row["Serious crime (by year)"]:
        serious_bars[point["year"]] = point["count"]
    assert serious_bars == {
        2014: pytest.approx(12.0, abs=0.05),
        2024: pytest.approx(12.0, abs=0.05),
    }

    mean_of_bars = sum(serious_bars.values()) / len(serious_bars)
    assert headline["Serious crime (avg/yr)"] == pytest.approx(mean_of_bars, abs=0.05)
|
||||
|
||||
|
||||
def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
|
||||
# P (AB1 1AA) has burglaries only in its single most-recent year (2024); Q
|
||||
# (AB1 1AB), far away, has a burglary in 2014. The type therefore spans TWO
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
|
|||
"potential_energy_rating": "B",
|
||||
"epc_property_type": "House",
|
||||
"built_form": "Mid-Terrace",
|
||||
"inspection_date": "2024-01-02",
|
||||
"inspection_date": date(2024, 1, 2),
|
||||
"total_floor_area": 84.5,
|
||||
"number_habitable_rooms": None,
|
||||
"floor_height": 2.4,
|
||||
|
|
@ -179,6 +179,65 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
|
|||
assert df.get_column("historical_prices").list.len().to_list() == [2]
|
||||
|
||||
|
||||
def test_run_dedup_prefers_valid_dated_cert_over_garbled_date(tmp_path: Path):
    """A certificate with an unparseable date must not win the "latest" dedup.

    Two certificates describe the same property. Under a raw string descending
    sort "not-a-date" outranks the ISO date and would be picked, attaching a
    stale rating/floor area; treated as a null Date it loses. The valid-dated
    certificate must win, so its rating ("C") and floor area (85) survive.
    """
    certificates = [
        _row(
            current_energy_rating="c",
            inspection_date="2024-01-01",
            total_floor_area="85",
        ),
        # Same property, but with an unparseable (OCR/garbled) inspection date.
        _row(
            current_energy_rating="g",
            inspection_date="not-a-date",
            total_floor_area="40",
        ),
    ]
    csv_text = io.StringIO()
    dict_writer = csv.DictWriter(csv_text, fieldnames=EPC_SOURCE_COLUMNS)
    dict_writer.writeheader()
    dict_writer.writerows(certificates)

    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        archive.writestr("certificates-2024.csv", csv_text.getvalue())

    # A single price-paid sale for the property the certificates belong to.
    price_paid_path = tmp_path / "price-paid.parquet"
    sale = pl.DataFrame(
        {
            "price": [250_000],
            "date_of_transfer": [date(2024, 2, 3)],
            "property_type": ["T"],
            "postcode": ["AA1 1AA"],
            "paon": ["1"],
            "saon": [None],
            "street": ["Example Street"],
            "locality": [None],
            "town_city": ["Exampletown"],
            "duration": ["F"],
            "old_new": ["N"],
            "ppd_category": ["A"],
        }
    )
    sale.write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)

    result = pl.read_parquet(output_path)

    assert result.height == 1
    # The valid-dated cert's facts are kept; the garbled-date cert is NOT chosen.
    kept_facts = result.select("current_energy_rating", "total_floor_area").to_dicts()
    assert kept_facts == [{"current_energy_rating": "C", "total_floor_area": 85.0}]
|
||||
|
||||
|
||||
def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
|
||||
zip_path = tmp_path / "domestic-csv.zip"
|
||||
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ from pipeline.transform.merge import (
|
|||
_build_unmatched_listing_seed_rows,
|
||||
_canonical_postcode_expr,
|
||||
_coalesce_direct_epc_columns,
|
||||
_dedupe_collapsed_properties,
|
||||
_filter_to_active_english_postcodes,
|
||||
_join_area_side_tables,
|
||||
_finalize_listings,
|
||||
|
|
@ -193,6 +194,159 @@ def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() ->
|
|||
_validate_postcode_feature_output(postcode_df, expected_postcode_count=2)
|
||||
|
||||
|
||||
def test_postcode_feature_validation_rejects_wrong_count() -> None:
    """The postcode feature output must match the expected universe size exactly.

    Too few rows (silently dropped postcodes), too many rows, and a duplicated
    key (the signature of a join fan-out) must each be rejected, so neither a
    truncated build nor a one-to-many join can ship.
    """

    def feature_frame(postcodes, lats, lons) -> pl.DataFrame:
        # Minimal postcode feature table; every row is tagged as England.
        return pl.DataFrame(
            {
                "Postcode": postcodes,
                "lat": lats,
                "lon": lons,
                "ctry25cd": ["E92000001"] * len(postcodes),
            }
        )

    invalid_outputs = [
        # Too few rows for an expected universe of 2.
        feature_frame(["AA1 1AA"], [51.0], [-0.1]),
        # Too many rows for an expected universe of 2.
        feature_frame(
            ["AA1 1AA", "BB1 1BB", "CC1 1CC"], [51.0, 52.0, 53.0], [-0.1, -0.2, -0.3]
        ),
        # Right row count but a duplicated key (n_unique < height).
        feature_frame(["AA1 1AA", "AA1 1AA"], [51.0, 51.0], [-0.1, -0.1]),
    ]

    for invalid in invalid_outputs:
        with pytest.raises(ValueError, match="active England postcode universe"):
            _validate_postcode_feature_output(invalid, expected_postcode_count=2)
|
||||
|
||||
|
||||
def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
    """Side tables unique on their join key yield exactly one row per postcode.

    A fan-out here would inflate the postcode universe above the
    active-England count, which is the failure the universe-size validation
    acts as a backstop for.
    """
    postcodes = ["AA1 1AA", "BB2 2BB"]
    lsoa_codes = ["E01000001", "E01000002"]
    district_codes = ["E09000001", "E09000002"]
    constituency_codes = ["E14000001", "E14000002"]

    def _keyed_by_postcode(extra: dict) -> pl.LazyFrame:
        # Side table keyed on the two base postcodes, plus any extra columns.
        return pl.LazyFrame({"postcode": postcodes, **extra})

    base = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoa_codes,
            "Local Authority District code (2024)": district_codes,
            "pcon": constituency_codes,
        }
    )
    crime = _keyed_by_postcode(
        {
            "Serious crime (avg/yr)": [1.0, 2.0],
            "Minor crime (avg/yr)": [3.0, 4.0],
        }
    )
    broadband = pl.LazyFrame(
        {
            "bb_postcode": postcodes,
            "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
        }
    )

    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": lsoa_codes}),
        ethnicity=pl.LazyFrame({"Geography_code": district_codes}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": lsoa_codes}),
        election=pl.LazyFrame({"pcon": constituency_codes}),
        poi_counts=_keyed_by_postcode({}),
        noise=_keyed_by_postcode({}),
        school_proximity=_keyed_by_postcode({}),
        conservation_areas=_keyed_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=broadband,
    ).collect()

    # One row per postcode in -> one row out; the universe is not inflated.
    assert joined.height == 2
    assert sorted(joined["postcode"].to_list()) == ["AA1 1AA", "BB2 2BB"]
|
||||
|
||||
|
||||
def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
    """Broadband postcodes join to the base despite spacing/casing drift.

    Broadband comes straight from Ofcom's CSV, so its postcode can differ in
    spacing/casing from the NSPL `pcds` base key. Both sides must be reduced to
    the same canonical form so a real postcode populates `max_download_speed`
    instead of silently missing the left join.
    """
    postcodes = ["AB1 2CD", "EF3 4GH"]
    lsoa_codes = ["E01000001", "E01000002"]
    district_codes = ["E09000001", "E09000002"]
    constituency_codes = ["E14000001", "E14000002"]

    def _keyed_by_postcode(extra: dict) -> pl.LazyFrame:
        # Side table keyed on the two base postcodes, plus any extra columns.
        return pl.LazyFrame({"postcode": postcodes, **extra})

    base = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoa_codes,
            "Local Authority District code (2024)": district_codes,
            "pcon": constituency_codes,
        }
    )
    crime = _keyed_by_postcode(
        {
            "Serious crime (avg/yr)": [1.0, 2.0],
            "Minor crime (avg/yr)": [3.0, 4.0],
        }
    )
    # AB1 2CD arrives lowercase + un-spaced; EF3 4GH arrives under two distinct
    # raw spellings that canonicalize to one key (the max speed must win, with
    # no fan-out of the base row).
    broadband = pl.LazyFrame(
        {
            "bb_postcode": ["ab1 2cd", "ef34gh", "EF3 4GH"],
            "max_download_speed": pl.Series([300, 30, 1000], dtype=pl.UInt16),
        }
    )

    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": lsoa_codes}),
        ethnicity=pl.LazyFrame({"Geography_code": district_codes}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": lsoa_codes}),
        election=pl.LazyFrame({"pcon": constituency_codes}),
        poi_counts=_keyed_by_postcode({}),
        noise=_keyed_by_postcode({}),
        school_proximity=_keyed_by_postcode({}),
        conservation_areas=_keyed_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=broadband,
    ).collect()

    # No fan-out: still one row per base postcode.
    assert joined.height == 2

    speed_by_postcode = {
        postcode: speed
        for postcode, speed in zip(
            joined["postcode"].to_list(), joined["max_download_speed"].to_list()
        )
    }
    # Spacing/casing drift still joins.
    assert speed_by_postcode["AB1 2CD"] == 300
    # Two raw spellings collapse to one canonical key; the max wins.
    assert speed_by_postcode["EF3 4GH"] == 1000

    # The temporary canonical join keys are not leaked into the output schema.
    for temp_column in (
        "_base_canonical_postcode",
        "_bb_canonical_postcode",
        "bb_postcode",
    ):
        assert temp_column not in joined.columns
|
||||
|
||||
|
||||
def test_listed_building_feature_is_property_level() -> None:
    """The listed-building feature must not be registered as an area column."""
    registered_as_area = LISTED_BUILDING_FEATURE in _AREA_COLUMNS
    assert not registered_as_area
|
||||
|
||||
|
|
@ -758,8 +912,10 @@ def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
|
|||
|
||||
def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
||||
# The crime table is LEFT-joined per postcode; a postcode absent from it
|
||||
# must NOT be fabricated as "zero crime" (the safest value). When every
|
||||
# per-type column is null the Serious/Minor rollups must stay null.
|
||||
# must NOT be fabricated as "zero crime" (the safest value). The Serious/Minor
|
||||
# rollups are precomputed in crime_spatial (the mean of the by-year rollup
|
||||
# bars), so the merge reads them straight through; a missing postcode leaves
|
||||
# them null.
|
||||
base = pl.LazyFrame(
|
||||
{
|
||||
"postcode": ["AA1 1AA", "BB2 2BB"],
|
||||
|
|
@ -772,7 +928,10 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
|||
def _by_postcode(extra: dict) -> pl.LazyFrame:
|
||||
return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})
|
||||
|
||||
# Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
|
||||
# Crime is present only for AA1 1AA; BB2 2BB is absent from the table. The
|
||||
# rollup headlines are precomputed values (deliberately NOT the per-type sum,
|
||||
# which would be 10.0 each) so this test proves the merge consumes the
|
||||
# precomputed column rather than re-summing per-type columns.
|
||||
crime = pl.LazyFrame(
|
||||
{
|
||||
"postcode": ["AA1 1AA"],
|
||||
|
|
@ -790,6 +949,8 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
|||
"Public order (avg/yr)": [1.0],
|
||||
"Drugs (avg/yr)": [1.0],
|
||||
"Other crime (avg/yr)": [1.0],
|
||||
"Serious crime (avg/yr)": [7.5],
|
||||
"Minor crime (avg/yr)": [4.2],
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -805,7 +966,12 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
|||
school_proximity=_by_postcode({}),
|
||||
conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
|
||||
tree_density=None,
|
||||
broadband=pl.LazyFrame({"bb_postcode": ["AA1 1AA", "BB2 2BB"]}),
|
||||
broadband=pl.LazyFrame(
|
||||
{
|
||||
"bb_postcode": ["AA1 1AA", "BB2 2BB"],
|
||||
"max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
|
||||
}
|
||||
),
|
||||
).collect()
|
||||
|
||||
by_postcode = {
|
||||
|
|
@ -814,14 +980,50 @@ def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
|
|||
"postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
|
||||
).iter_rows(named=True)
|
||||
}
|
||||
# Present postcode: rollups are the component sums (1+2+3+4, 10×1).
|
||||
assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 10.0
|
||||
assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 10.0
|
||||
# Present postcode: rollups are the precomputed headline values, read through
|
||||
# unchanged (NOT the per-type sum of 10.0).
|
||||
assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 7.5
|
||||
assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 4.2
|
||||
# Missing postcode: rollups stay null rather than fabricating 0.0.
|
||||
assert by_postcode["BB2 2BB"]["serious_crime_avg_yr"] is None
|
||||
assert by_postcode["BB2 2BB"]["minor_crime_avg_yr"] is None
|
||||
|
||||
|
||||
def test_dedupe_collapsed_properties_keeps_most_recent_per_address() -> None:
    """Dedup keeps one row per (postcode, pp_address): the latest transaction.

    The terminated-postcode remap can fold two distinct postcodes onto one
    active successor, which lands the same physical address on a single
    (postcode, pp_address) key with conflicting sale records. Genuinely
    distinct addresses within the postcode must be left alone.
    """
    from datetime import datetime

    postcode = "SW3 3JY"
    elystan = "45 ELYSTAN PLACE"
    other = "9 OTHER ROAD"

    # (address, transfer date, price) for each input row, in input order.
    stale_sale = (elystan, datetime(1990, 1, 1), 1_587_700)
    latest_sale = (elystan, datetime(2015, 6, 1), 4_500_000)
    unrelated_sale = (other, datetime(2000, 1, 1), 250_000)
    sales = [stale_sale, latest_sale, unrelated_sale]

    collapsed = pl.LazyFrame(
        {
            "postcode": [postcode] * len(sales),
            "pp_address": [address for address, _, _ in sales],
            "date_of_transfer": [transferred for _, transferred, _ in sales],
            "latest_price": [price for _, _, price in sales],
        }
    )

    deduped = _dedupe_collapsed_properties(collapsed).collect()

    # The two ELYSTAN PLACE rows collapse to one, leaving two unique keys.
    assert deduped.height == 2
    assert deduped.select(["postcode", "pp_address"]).is_unique().all()

    survivors = {row["pp_address"]: row for row in deduped.iter_rows(named=True)}

    # The surviving ELYSTAN PLACE row is the 2015 sale at 4.5M, not an
    # arbitrary pick.
    kept = survivors[elystan]
    assert kept["date_of_transfer"] == latest_sale[1]
    assert kept["latest_price"] == latest_sale[2]

    # The distinct address sharing the postcode is untouched.
    assert survivors[other]["latest_price"] == unrelated_sale[2]
|
||||
|
||||
|
||||
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
|
||||
base = {
|
||||
"postcode": "AA1 1AA",
|
||||
|
|
|
|||
110
pipeline/transform/test_noise_overlay_tiles.py
Normal file
110
pipeline/transform/test_noise_overlay_tiles.py
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
import numpy as np
|
||||
import rasterio
|
||||
from rasterio.transform import from_origin
|
||||
from rasterio.warp import transform_bounds
|
||||
|
||||
from pipeline.transform import noise_overlay_tiles
|
||||
from pipeline.transform.noise_overlay_tiles import RasterInfo, _read_noise_tile
|
||||
|
||||
|
||||
def _write_corridor_raster(path, nodata=-96.0):
    """Write a small EPSG:27700 GeoTIFF with a loud corridor beside quiet cells.

    The 8x8 grid has its two leftmost columns at 70 dB and every other cell at
    a genuine quiet 0.0, apart from one cell set to ``nodata``. Bilinear
    blending of the 0 cells would fabricate a halo of intermediate dB values
    between 0 and 70, which is what the calling test guards against.

    Returns ``path`` unchanged so the call can be used inline.
    """
    rows, cols = 8, 8
    cell_size = 10.0  # 10m cells

    grid = np.zeros((rows, cols), dtype=np.float32)
    grid[:, :2] = 70.0
    # A single true nodata cell, so it is exercised by the masking as well.
    grid[0, cols - 1] = nodata

    # Anchor somewhere inside England's BNG extent.
    west, north = 300_000.0, 300_080.0
    profile = dict(
        driver="GTiff",
        height=grid.shape[0],
        width=grid.shape[1],
        count=1,
        dtype=grid.dtype,
        crs="EPSG:27700",
        transform=from_origin(west, north, cell_size, cell_size),
        nodata=nodata,
    )
    with rasterio.open(path, "w", **profile) as sink:
        sink.write(grid, 1)
    return path
|
||||
|
||||
|
||||
def test_read_noise_tile_does_not_fabricate_halo(tmp_path):
    """Rendering the corridor raster must not invent dB values between 0 and 70."""
    corridor_db = 70.0
    tolerance = 1e-3
    corridor_floor = corridor_db - tolerance

    source_path = _write_corridor_raster(tmp_path / "corridor.tif")

    with rasterio.open(source_path) as src:
        native_bounds = src.bounds
        mercator_bounds = transform_bounds(
            src.crs,
            noise_overlay_tiles.WEB_MERCATOR_CRS,
            *native_bounds,
            densify_pts=21,
        )

    raster = RasterInfo(path=source_path, bounds_mercator=mercator_bounds)

    # A high-resolution render, so any bilinear halo would show up as
    # intermediate dB values along the corridor/quiet seam.
    rendered = _read_noise_tile([raster], mercator_bounds, 64)
    valid = rendered[np.isfinite(rendered)]

    # Every finite cell must be the genuine corridor value (~70); nothing may
    # sit strictly between 0 and 70.
    in_between = (valid > 0.0) & (valid < corridor_floor)
    halo = valid[in_between]
    assert halo.size == 0, f"fabricated halo values present: {np.unique(halo)}"

    # Sanity: the corridor itself must still be rendered.
    assert valid.size > 0
    assert (valid >= corridor_floor).all()
|
||||
|
||||
|
||||
def test_read_noise_tile_preserves_peak_under_downsample(tmp_path):
    """Downsampling must keep the loudest source cell rather than average it.

    The source is an 8x8 EPSG:27700 raster holding one 75 dB cell in a 50 dB
    field. Rendering it into a 4x4 tile downsamples; bilinear resampling would
    dilute the peak (arithmetic dB averaging), whereas Resampling.max keeps
    the worst-case dB.
    """
    background_db, peak_db = 50.0, 75.0
    tolerance = 1e-3

    grid = np.full((8, 8), background_db, dtype=np.float32)
    grid[4, 4] = peak_db

    peak_path = tmp_path / "peak.tif"
    profile = dict(
        driver="GTiff",
        height=grid.shape[0],
        width=grid.shape[1],
        count=1,
        dtype=grid.dtype,
        crs="EPSG:27700",
        transform=from_origin(300_000.0, 300_080.0, 10.0, 10.0),
        nodata=-96.0,
    )
    with rasterio.open(peak_path, "w", **profile) as sink:
        sink.write(grid, 1)

    with rasterio.open(peak_path) as src:
        mercator_bounds = transform_bounds(
            src.crs,
            noise_overlay_tiles.WEB_MERCATOR_CRS,
            *src.bounds,
            densify_pts=21,
        )

    raster = RasterInfo(path=peak_path, bounds_mercator=mercator_bounds)

    # 8x8 source into a 4x4 tile: a genuine downsample.
    rendered = _read_noise_tile([raster], mercator_bounds, 4)
    valid = rendered[np.isfinite(rendered)]

    assert valid.size > 0
    loudest = valid.max()
    # The loud peak must survive the downsample (max, not arithmetic mean).
    assert loudest >= peak_db - tolerance, f"peak diluted to {loudest}"
    # Max resampling must never invent a value louder than the source.
    assert loudest <= peak_db + tolerance
|
||||
|
|
@ -1,12 +1,115 @@
|
|||
import json
|
||||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.transform_poi import (
|
||||
_load_ofsted_ratings,
|
||||
_school_icon_category_expr,
|
||||
transform,
|
||||
transform_grocery_retail_points,
|
||||
)
|
||||
|
||||
|
||||
def _write_boundary(tmp_path):
    """Write ``england.geojson`` under ``tmp_path`` and return its path.

    The file is a FeatureCollection with a single rectangular Polygon feature
    spanning longitude -1..1 and latitude 51..52, which covers the London-area
    test coordinates used by the transform() fixtures so in_england_mask keeps
    them.
    """
    west, south, east, north = -1.0, 51.0, 1.0, 52.0
    # Closed ring: the first vertex is repeated as the last.
    ring = [
        [west, south],
        [east, south],
        [east, north],
        [west, north],
        [west, south],
    ]
    feature = {
        "type": "Feature",
        "properties": {},
        "geometry": {"type": "Polygon", "coordinates": [ring]},
    }
    collection = {"type": "FeatureCollection", "features": [feature]}

    target = tmp_path / "england.geojson"
    target.write_text(json.dumps(collection))
    return target
|
||||
|
||||
|
||||
def _write_transform_inputs(tmp_path, raw_pois: pl.DataFrame):
    """Write every parquet/GeoJSON input transform() needs under ``tmp_path``.

    ``raw_pois`` is stored as the raw OSM POIs file; the NaPTAN, grocery, GIAS
    and Ofsted frames are minimal but valid. The returned dict maps
    transform()'s keyword-argument names to the written paths, so it can be
    splatted straight into the call.
    """

    def _persist(frame: pl.DataFrame, filename: str):
        # Write one frame to tmp_path and hand back where it went.
        target = tmp_path / filename
        frame.write_parquet(target)
        return target

    naptan = pl.DataFrame(
        {
            "id": ["naptan-1"],
            "name": ["Test Rail Station"],
            "category": ["Rail station"],
            "lat": [51.51],
            "lng": [-0.13],
        }
    )

    store_numbers = range(1, 6)
    store_count = len(store_numbers)
    grocery = pl.DataFrame(
        {
            "id": list(store_numbers),
            "retailer": ["Tesco"] * store_count,
            "fascia": ["Tesco"] * store_count,
            "store_name": [f"Tesco Test {i}" for i in store_numbers],
            "long_wgs": [-0.14] * store_count,
            "lat_wgs": [51.52] * store_count,
        }
    )

    gias = pl.DataFrame(
        {
            "urn": [1001],
            "name": ["Test Primary School"],
            "phase": ["Primary"],
            "type": ["Community school"],
            "type_group": ["Local authority maintained schools"],
            "age_range": ["4–11"],
            "gender": ["Mixed"],
            "religious_character": [None],
            "admissions_policy": ["Comprehensive"],
            "nursery_provision": ["No"],
            "sixth_form": ["No"],
            "capacity": [200],
            "pupils": [180],
            "fsm_percent": [12.5],
            "trust": [None],
            "address": ["1 Test Street"],
            "postcode": ["E1 1AA"],
            "local_authority": ["Test LA"],
            "website": [None],
            "telephone": ["02012345678"],
            "head_name": ["Jane Doe"],
            "lat": [51.53],
            "lng": [-0.12],
        }
    )

    # The Ofsted URN matches the single GIAS school above.
    ofsted = pl.DataFrame(
        {
            "URN": [1001],
            "Latest OEIF overall effectiveness": ["2"],
            "Ungraded inspection overall outcome": [None],
        }
    )

    return {
        "input_path": _persist(raw_pois, "pois.parquet"),
        "naptan_path": _persist(naptan, "naptan.parquet"),
        "boundary_path": _write_boundary(tmp_path),
        "grocery_retail_points_path": _persist(grocery, "grocery.parquet"),
        "gias_path": _persist(gias, "gias.parquet"),
        "ofsted_path": _persist(ofsted, "ofsted.parquet"),
    }
|
||||
|
||||
|
||||
def test_transform_grocery_retail_points_outputs_chain_categories():
|
||||
raw = pl.DataFrame(
|
||||
{
|
||||
|
|
@ -292,3 +395,79 @@ def test_school_icon_category_handles_one_sided_age_ranges():
|
|||
"Primary school",
|
||||
"School",
|
||||
]
|
||||
|
||||
|
||||
def test_transform_dedupes_multi_tag_pois(tmp_path):
    """Rows sharing an id and a friendly category collapse to a single row.

    One OSM object can carry several tag keys that map to the SAME friendly
    category, so pois.py emits one raw row per key with the SAME id. Here
    "amenity/pharmacy" and "shop/chemist" both map to "Pharmacy".
    """
    tag_keys = ["amenity/pharmacy", "shop/chemist"]
    copies = len(tag_keys)
    raw = pl.DataFrame(
        {
            "id": ["n42"] * copies,
            "name": ["Boots"] * copies,
            "category": tag_keys,
            "lat": [51.50] * copies,
            "lng": [-0.10] * copies,
        }
    )

    result = transform(**_write_transform_inputs(tmp_path, raw)).collect()

    # No (id, category) pair appears more than once.
    pair_counts = result.group_by("id", "category").len()
    assert pair_counts["len"].max() == 1

    # The single physical pharmacy is present exactly once.
    is_boots = pl.col("id") == "n42"
    is_pharmacy = pl.col("category") == "Pharmacy"
    assert result.filter(is_boots & is_pharmacy).height == 1
|
||||
|
||||
|
||||
def test_osm_supermarkets_dropped(tmp_path):
    """OSM supermarkets are dropped while other OSM grocery categories survive.

    GEOLYTIX is authoritative for supermarkets, so an OSM "shop/supermarket"
    row must not flow through as a second Groceries/Supermarket pin. A
    complementary grocery category (Convenience Store) must still come out.
    """
    # (id, name, raw category, lat, lng) per raw OSM row.
    osm_rows = [
        ("n1", "Some Supermarket", "shop/supermarket", 51.50, -0.10),
        ("n2", "Corner Shop", "shop/convenience", 51.51, -0.11),
    ]
    ids, names, categories, lats, lngs = (list(column) for column in zip(*osm_rows))
    raw = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": categories,
            "lat": lats,
            "lng": lngs,
        }
    )

    result = transform(**_write_transform_inputs(tmp_path, raw)).collect()

    in_groceries = pl.col("group") == "Groceries"
    is_supermarket = pl.col("category") == "Supermarket"
    assert result.filter(in_groceries & is_supermarket).height == 0

    # Complementary OSM grocery category survives.
    is_convenience = pl.col("category") == "Convenience Store"
    assert result.filter(is_convenience).height == 1
|
||||
|
||||
|
||||
def test_transform_output_unique_per_id_category(tmp_path):
    """Soundness: transform() yields at most one row per (id, category).

    The check runs over the whole output, across every source.
    """
    # (id, name, raw category, lat, lng); each id appears under two tag keys.
    osm_rows = [
        ("n42", "Boots", "amenity/pharmacy", 51.50, -0.10),
        ("n42", "Boots", "shop/chemist", 51.50, -0.10),
        ("n7", "St Mary's", "amenity/place_of_worship", 51.55, -0.15),
        ("n8", "St Mary's", "building/church", 51.55, -0.15),
    ]
    ids, names, categories, lats, lngs = (list(column) for column in zip(*osm_rows))
    raw = pl.DataFrame(
        {
            "id": ids,
            "name": names,
            "category": categories,
            "lat": lats,
            "lng": lngs,
        }
    )

    result = transform(**_write_transform_inputs(tmp_path, raw)).collect()

    pair_counts = result.group_by("id", "category").len()
    assert pair_counts["len"].max() == 1
|
||||
|
|
|
|||
|
|
@ -6,6 +6,10 @@ import polars as pl
|
|||
from pipeline.utils.england_geometry import in_england_mask
|
||||
|
||||
DROP_CATEGORIES = {
|
||||
# GEOLYTIX Grocery Retail Points is the authoritative supermarket source
|
||||
# (transform_grocery_retail_points), so drop OSM supermarkets to avoid
|
||||
# double-counting each store as both a GEOLYTIX brand and an OSM "Supermarket".
|
||||
"shop/supermarket",
|
||||
# Street furniture & infrastructure
|
||||
"amenity/advice",
|
||||
"amenity/atm",
|
||||
|
|
@ -364,14 +368,6 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"leisure/yes",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Groceries",
|
||||
"Supermarket",
|
||||
"🛒",
|
||||
[
|
||||
"shop/supermarket",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Groceries",
|
||||
"Convenience Store",
|
||||
|
|
@ -1534,6 +1530,14 @@ def transform(
|
|||
pl.col("category").replace_strict(emoji_mapping).alias("emoji"),
|
||||
)
|
||||
|
||||
# A single OSM object can carry several tag keys that map to the same
|
||||
# friendly category (e.g. amenity/pharmacy + shop/chemist -> "Pharmacy"),
|
||||
# which pois.py emits as multiple raw rows sharing one id. Collapse those
|
||||
# duplicates so they don't inflate downstream proximity counts; rows sharing
|
||||
# an id with DIFFERENT categories are preserved. Other sources are
|
||||
# pre-deduplicated.
|
||||
lf = lf.unique(subset=["id", "category"], keep="first", maintain_order=True)
|
||||
|
||||
naptan_df = pl.scan_parquet(naptan_path).collect()
|
||||
mask = in_england_mask(
|
||||
boundary_path,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue