Split up
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s

This commit is contained in:
Andras Schmelczer 2026-06-12 21:51:37 +01:00
parent cf39ad754e
commit f59d01227b
91 changed files with 10370 additions and 7562 deletions

View file

@ -123,10 +123,13 @@ def transform_crime(
)
yearly_counts = (
filtered.group_by("LSOA code", "year", "Crime type", "Month")
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "year", "Crime type")
.agg(pl.col("count").sum().alias("count"))
# Sum per-incident weights directly: a 2021 LSOA can receive incidents
# carrying different `_weight`s in the same month (split 2011 parent at
# 1/N alongside an unsplit one at 1), so `_weight.first() * len` would
# apply one row's weight to all of them — and nondeterministically so,
# since `first` after a join has no ordering guarantee.
filtered.group_by("LSOA code", "year", "Crime type")
.agg(pl.col("_weight").sum().alias("count"))
.join(months_per_year, on="year")
.with_columns(
(pl.col("count") * 12.0 / pl.col("months_in_year")).alias("per_year")
@ -191,10 +194,10 @@ def _write_crime_by_year(
)
yearly_per_type = (
filtered.group_by("LSOA code", "Crime type", "year", "Month")
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "Crime type", "year")
.agg(pl.col("count").sum().alias("count"))
# Per-incident weight sum, not `_weight.first() * len` — see the
# matching comment in transform_crime.
filtered.group_by("LSOA code", "Crime type", "year")
.agg(pl.col("_weight").sum().alias("count"))
.join(months_per_year, on="year")
.with_columns(
(pl.col("count").cast(pl.Float32) * 12.0 / pl.col("months_in_year"))

View file

@ -97,6 +97,13 @@ def epc_band_to_year(band: pl.Expr) -> pl.Expr:
EPC_SOURCE_COLUMNS = [
"address",
# The individual lines behind `address` (= address1+2+3): address2/3
# frequently carry a village/locality token that the price-paid address
# lacks, so the matcher also scores against address1-only and
# address1+address2 variants (see fuzzy_join_on_postcode's variant
# columns).
"address1",
"address2",
"postcode",
"uprn",
"current_energy_rating",
@ -150,6 +157,12 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
return (
raw.select(
_clean_string("address").alias("epc_address"),
# Match variants: the full address minus the locality-bearing
# trailing lines. Inadmissible variants (ones whose dropped lines
# carry numbers or flat designators) are filtered inside the
# fuzzy join.
_join_address_parts("address1").alias("epc_address_a1"),
_join_address_parts("address1", "address2").alias("epc_address_a12"),
_clean_string("postcode").str.to_uppercase().alias("epc_postcode"),
# UPRN keys an exact listing->EPC join downstream (~99% populated).
_clean_string("uprn").alias("uprn"),
@ -536,6 +549,12 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
.filter(pl.col("pp_property_type") != "Other")
.with_columns(
_join_address_parts("saon", "paon", "street").alias("pp_address"),
# Match variant with the locality appended: the EPC address often
# carries a village/locality token the bare saon+paon+street
# lacks, which alone drags short addresses below the threshold.
_join_address_parts("saon", "paon", "street", "locality").alias(
"pp_address_loc"
),
)
.with_columns(
normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
@ -597,6 +616,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
.group_by("_pp_group_address", "_pp_group_postcode", maintain_order=True)
.agg(
pl.col("pp_address").last(),
pl.col("pp_address_loc").last(),
pl.col("postcode").last(),
pl.col("_pp_match_address").last(),
pl.col("_pp_match_postcode").last(),
@ -633,6 +653,8 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
right_address_col="epc_address",
left_postcode_col="postcode",
right_postcode_col="epc_postcode",
left_variant_cols=["pp_address_loc"],
right_variant_cols=["epc_address_a1", "epc_address_a12"],
)
.drop("epc_postcode")
# Audit trail: keep the fuzzy-match confidence (100 = exact address
@ -672,6 +694,9 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
[
"old_new",
"first_transfer_date",
"pp_address_loc",
"epc_address_a1",
"epc_address_a12",
"_pp_match_address",
"_pp_match_postcode",
"_pp_group_address",

View file

@ -24,9 +24,12 @@ from pipeline.transform.price_estimation.knn import (
MIN_COMPARABLE_PSM,
)
from pipeline.utils.fuzzy_join import (
_NUMBER_RE as _SUFFIXED_NUMBER_RE,
_numbers_compatible as _equal_numbers_compatible,
normalize_address_key,
normalize_postcode_key,
)
from pipeline.utils.normalize import drop_digit_tokens
from pipeline.utils.postcode_mapping import build_postcode_mapping
MIN_FLOOR_AREA_M2 = 10
@ -209,8 +212,15 @@ def _is_dynamic_poi_metric_column(column: str) -> bool:
)
def _numbers_compatible(left: str, right: str) -> bool:
"""Require address/list-entry numbers to agree when either side has numbers."""
def _subset_numbers_compatible(left: str, right: str) -> bool:
"""Require one side's numbers to be a subset of the other's.
Subset (not equality) is correct ONLY for listed-building name matching: a
list entry like "10-12 HIGH STREET" should flag "10 HIGH STREET". Address-
to-address matching must use the canonical `fuzzy_join._numbers_compatible`
instead (set equality over ``\\d+[A-Z]?`` tokens) — subset semantics there
let a single flat absorb its whole building (see fuzzy_join docstring).
"""
left_nums = set(_NUMBER_RE.findall(left))
right_nums = set(_NUMBER_RE.findall(right))
smaller, larger = (
@ -446,7 +456,7 @@ def _matched_listed_building_flags(
matched = False
for address_key in address_keys:
for listed_name in listed_names:
if not _numbers_compatible(address_key, listed_name):
if not _subset_numbers_compatible(address_key, listed_name):
continue
if fuzz.token_set_ratio(address_key, listed_name) >= min_score:
matched = True
@ -1152,8 +1162,9 @@ def _address_score(query: str, candidate: str | None, *, allow_token_set: bool)
# token (e.g. "KINGSWOOD") subsets to 100 against any long address that
# merely contains it — so number-less queries score with token_sort_ratio
# only, matching the canonical fuzzy_join._score_bucket. For a NUMBERED
# query the unconditional _numbers_compatible gate has already guaranteed the
# candidate carries compatible house numbers, so token_set cannot inflate
# query the unconditional fuzzy_join._numbers_compatible gate has already
# guaranteed the candidate carries identical house numbers, so token_set
# cannot inflate
# across different addresses; allowing it recovers genuine matches where the
# scraped listing appends trailing town/county tokens the bare register
# address omits (e.g. "105 RIDGEWAY DRIVE BROMLEY KENT" vs "105 RIDGEWAY
@ -1213,7 +1224,7 @@ def _rooms_bonus(left: int | None, right: int | None) -> float:
def _street_only_address(address: str) -> str:
"""The street/locality part of a normalised address: digit-bearing tokens
(house numbers, flat numbers, including letter suffixes like 8A) removed."""
return " ".join(token for token in address.split() if not _NUMBER_RE.search(token))
return drop_digit_tokens(address)
def _is_specific_street_query(query: str) -> bool:
@ -1262,9 +1273,9 @@ def _best_listing_match(
``uprn_index`` (postcode-independent, so it is robust even when the
listing's postcode is slightly off); (2) failing that, the highest
fuzzy street-address similarity within the listing's own postcode bucket.
No property-attribute heuristics are used — `_numbers_compatible` gates
every fuzzy match unconditionally (so a number-less listing can never match
a numbered property, and vice versa), as in the canonical
No property-attribute heuristics are used — `fuzzy_join._numbers_compatible`
gates every fuzzy match unconditionally (so a number-less listing can never
match a numbered property, and vice versa), as in the canonical
`fuzzy_join._score_bucket`. A house number additionally lowers the score
threshold and (via `_address_score`) permits token_set scoring; a number-less
address scores on token_sort only and must match the street almost exactly.
@ -1294,9 +1305,11 @@ def _best_listing_match(
address = candidate.get(field)
if not address:
continue
# Unconditional number gate (matches fuzzy_join): a number-less
# listing cannot match a numbered candidate and vice versa.
if not _numbers_compatible(query, address):
# Unconditional number gate (the canonical fuzzy_join one: set
# equality over suffix-aware tokens): a number-less listing cannot
# match a numbered candidate, 8A cannot match 8B, and a flat
# cannot absorb its whole building.
if not _equal_numbers_compatible(query, address):
continue
score = _address_score(query, address, allow_token_set=listing_has_numbers)
if score > best_score:
@ -1388,7 +1401,7 @@ def _best_street_epc_fallback(
street_score_cache[cache_key] = qualifying
listing_postcode = listing.get("_listing_match_postcode")
listing_numbers = set(_NUMBER_RE.findall(query))
listing_numbers = set(_SUFFIXED_NUMBER_RE.findall(query))
best: dict | None = None
best_total = float("-inf")
best_street_score = 0
@ -1417,7 +1430,9 @@ def _best_street_epc_fallback(
):
total += _STREET_FALLBACK_SAME_POSTCODE_BONUS
if listing_numbers and listing_numbers & set(
_NUMBER_RE.findall(candidate.get("_direct_epc_match_address") or "")
_SUFFIXED_NUMBER_RE.findall(
candidate.get("_direct_epc_match_address") or ""
)
):
total += _STREET_FALLBACK_NUMBER_OVERLAP_BONUS
if total > best_total:

View file

@ -88,6 +88,12 @@ SECONDARY_AGES = (11, 15)
NURSERY_COHORT_WEIGHT = 0.5 # ages < 4
SIXTH_FORM_COHORT_WEIGHT = 0.6 # ages >= 16
# Assumed bounds for the one-sided age-range shapes GIAS emits when a
# statutory age is missing: "up to {high}" starts at the earliest nursery
# intake, "{low}+" runs to the end of sixth form.
EARLIEST_INTAKE_AGE = 2
DEFAULT_LEAVING_AGE = 19
# Only schools that admit (mostly) by geography take part in the assignment.
# Independent, special and Welsh schools and post-16 colleges either don't
# admit by distance or fall outside the England postcode universe; selective
@ -296,11 +302,28 @@ def phase_intakes(gias: pl.DataFrame) -> pl.DataFrame:
e.g. "3–11" = ages 3..10) with nursery and sixth-form ages down-weighted,
and each phase receives the share of cohort weight in its age band.
"""
ages = pl.col("age_range").str.extract_all(r"\d+")
low = ages.list.get(0, null_on_oob=True).cast(pl.Int64, strict=False)
# gias._format_age_range emits three shapes: "{low}–{high}", "up to {high}"
# (StatutoryLowAge missing) and "{low}+" (StatutoryHighAge missing). Parse
# all three — the one-sided shapes previously fell through the two-number
# parse and silently dropped the school from the catchment supply.
age = pl.col("age_range")
leading = age.str.extract(r"^\s*(\d+)", 1).cast(pl.Int64, strict=False)
trailing = age.str.extract(r"(\d+)\s*$", 1).cast(pl.Int64, strict=False)
low = (
pl.when(age.str.starts_with("up to"))
.then(pl.lit(EARLIEST_INTAKE_AGE, dtype=pl.Int64))
.otherwise(leading)
)
# The leaving age is exclusive as a cohort: a "3-11" school teaches
# children aged 3 through 10.
high = ages.list.get(1, null_on_oob=True).cast(pl.Int64, strict=False) - 1
# children aged 3 through 10. "{low}+" schools get the end of sixth form
# as their assumed leaving age (post-19 institutions then carry no
# primary/secondary cohort weight and drop out naturally).
high = (
pl.when(age.str.ends_with("+"))
.then(pl.lit(DEFAULT_LEAVING_AGE, dtype=pl.Int64))
.otherwise(trailing)
- 1
)
schools = (
gias.filter(

View file

@ -275,6 +275,51 @@ def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
assert burglaries["E01000099"] == [{"year": 2024, "count": 12.0}]
def test_transform_crime_sums_mixed_weights_within_a_target_lsoa(tmp_path):
"""Irregular (M:N) recodes can land rows with DIFFERENT `_weight`s in the
same (lsoa21, year, type) group: here E01000050 receives 0.5-weighted
incidents from split E01000001 alongside a 1.0-weighted incident from
E01000099. The aggregation must sum per-incident weights; the old
`_weight.first() * len` applied one row's weight to all three
(nondeterministically 1.5 or 3.0 instead of 2.0)."""
crime_dir = tmp_path / "crime"
month_dir = crime_dir / "2024-01"
month_dir.mkdir(parents=True)
header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
(month_dir / "2024-01-test-force-street.csv").write_text(
"\n".join(
[
header,
"1,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
"2,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
"3,2024-01,F,F,-0.1,51.5,X,E01000099,L,Burglary,U,",
]
)
+ "\n"
)
lookup_path = tmp_path / "lookup.parquet"
pl.DataFrame(
{
"lsoa11": ["E01000001", "E01000001", "E01000099"],
"lsoa21": ["E01000050", "E01000051", "E01000050"],
}
).write_parquet(lookup_path)
output = tmp_path / "crime.parquet"
by_year_output = tmp_path / "by_year.parquet"
transform_crime(crime_dir, output, by_year_output, lookup_path)
# E01000050: 0.5 + 0.5 + 1.0 = 2.0 incidents -> 24/yr annualised.
# E01000051: 0.5 + 0.5 = 1.0 incident -> 12/yr.
avg = pl.read_parquet(output).sort("LSOA code").to_dicts()
assert avg == [
{"LSOA code": "E01000050", "Burglary (avg/yr)": 24.0},
{"LSOA code": "E01000051", "Burglary (avg/yr)": 12.0},
]
def test_transform_crime_maps_legacy_crime_types(tmp_path):
"""Pre-2014 police.uk type names are aliased to current equivalents instead
of being dropped."""

View file

@ -25,6 +25,8 @@ def _write_csv(path: Path, fieldnames: list[str], rows: list[dict[str, str]]) ->
def _row(**overrides: str) -> dict[str, str]:
row = {
"address": "1 Example Street",
"address1": "1 Example Street",
"address2": "Hale",
"postcode": " aa1 1aa ",
"uprn": "100012345678",
"current_energy_rating": "c",
@ -54,6 +56,8 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
assert df.to_dicts() == [
{
"epc_address": "1 Example Street",
"epc_address_a1": "1 Example Street",
"epc_address_a12": "1 Example Street Hale",
"epc_postcode": "AA1 1AA",
"uprn": "100012345678",
"current_energy_rating": "C",

View file

@ -1609,6 +1609,37 @@ def test_best_listing_match_numbered_query_cannot_subset_inflate_across_numbers(
assert result is None
def test_best_listing_match_letter_suffix_flats_do_not_cross_match() -> None:
# Regression: the gate uses fuzzy_join's suffix-aware tokens, so "8A" and
# "8B" are different numbers. Under the old digit-only tokens both looked
# like {8} and token_sort scored ~93, attaching the wrong flat's record
# whenever the true candidate was absent from the bucket.
candidates = [{"pp_address": "8B HIGH STREET"}]
result = _best_listing_match(
listing_uprn=None,
query="8A HIGH STREET",
uprn_index={},
bucket_candidates=candidates,
addressed_fields=["pp_address"],
)
assert result is None
def test_best_listing_match_building_listing_cannot_absorb_single_flat() -> None:
# Regression: set equality (not subset) over number tokens, so a whole-
# building listing "188 GREAT NORTH WAY" no longer matches "FLAT 1 188
# GREAT NORTH WAY" (token_set would have scored the pair 100).
candidates = [{"pp_address": "FLAT 1 188 GREAT NORTH WAY"}]
result = _best_listing_match(
listing_uprn=None,
query="188 GREAT NORTH WAY",
uprn_index={},
bucket_candidates=candidates,
addressed_fields=["pp_address"],
)
assert result is None
def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
None
):

View file

@ -191,6 +191,28 @@ def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
assert intakes["secondary_intake"].to_list() == [0.0, 500.0, 0.0, 500.0, 1000.0]
def test_phase_intakes_parses_one_sided_age_ranges():
"""gias._format_age_range emits "up to {high}" and "{low}+" when a
statutory age is missing; those schools must stay in the catchment supply
instead of being silently dropped by a two-number parse."""
intakes = phase_intakes(
pl.DataFrame(
[
# "up to 11" = assumed cohorts 2..10: nursery years 2-3 weigh
# 0.5 each, primary 4..10 weighs 7 -> primary 210 * 7/8.
_gias_row(1, age_range="up to 11", pupils=210),
# "16+" = assumed cohorts 16..18, all sixth form: no
# primary/secondary intake, so the school contributes nothing
# but must not crash the parse.
_gias_row(2, age_range="16+", pupils=400),
]
)
).sort("urn")
assert intakes["urn"].to_list() == [1, 2]
assert intakes["primary_intake"].to_list() == [210.0 * 7 / 8, 0.0]
assert intakes["secondary_intake"].to_list() == [0.0, 0.0]
def test_phase_intakes_excludes_non_state_and_selective_schools():
intakes = phase_intakes(
pl.DataFrame(

View file

@ -5,6 +5,7 @@ import numpy as np
import polars as pl
from pipeline.utils.england_geometry import in_england_mask
from pipeline.utils.normalize import strip_or_empty
DROP_CATEGORIES = {
# GEOLYTIX Grocery Retail Points is the authoritative supermarket source
@ -1313,9 +1314,7 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
def normalize_grocery_retailer(retailer: str | None) -> str:
if retailer is None:
return ""
retailer = retailer.strip()
retailer = strip_or_empty(retailer)
if retailer in COOP_RETAILERS:
return "Co-op"
return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)