Split up

2026-06-12 21:51:37 +01:00 · 2026-06-12 21:51:37 +01:00 · f59d01227b
commit f59d01227b
parent cf39ad754e
91 changed files with 10370 additions and 7562 deletions
--- a/pipeline/transform/crime.py
+++ b/pipeline/transform/crime.py
@@ -123,10 +123,13 @@ def transform_crime(
    )

    yearly_counts = (
-        filtered.group_by("LSOA code", "year", "Crime type", "Month")
-        .agg((pl.col("_weight").first() * pl.len()).alias("count"))
-        .group_by("LSOA code", "year", "Crime type")
-        .agg(pl.col("count").sum().alias("count"))
+        # Sum per-incident weights directly: a 2021 LSOA can receive incidents
+        # carrying different `_weight`s in the same month (split 2011 parent at
+        # 1/N alongside an unsplit one at 1), so `_weight.first() * len` would
+        # apply one row's weight to all of them — and nondeterministically so,
+        # since `first` after a join has no ordering guarantee.
+        filtered.group_by("LSOA code", "year", "Crime type")
+        .agg(pl.col("_weight").sum().alias("count"))
        .join(months_per_year, on="year")
        .with_columns(
            (pl.col("count") * 12.0 / pl.col("months_in_year")).alias("per_year")
@@ -191,10 +194,10 @@ def _write_crime_by_year(
    )

    yearly_per_type = (
-        filtered.group_by("LSOA code", "Crime type", "year", "Month")
-        .agg((pl.col("_weight").first() * pl.len()).alias("count"))
-        .group_by("LSOA code", "Crime type", "year")
-        .agg(pl.col("count").sum().alias("count"))
+        # Per-incident weight sum, not `_weight.first() * len` — see the
+        # matching comment in transform_crime.
+        filtered.group_by("LSOA code", "Crime type", "year")
+        .agg(pl.col("_weight").sum().alias("count"))
        .join(months_per_year, on="year")
        .with_columns(
            (pl.col("count").cast(pl.Float32) * 12.0 / pl.col("months_in_year"))
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@@ -97,6 +97,13 @@ def epc_band_to_year(band: pl.Expr) -> pl.Expr:

 EPC_SOURCE_COLUMNS = [
    "address",
+    # The individual lines behind `address` (= address1+2+3): address2/3
+    # frequently carry a village/locality token that the price-paid address
+    # lacks, so the matcher also scores against address1-only and
+    # address1+address2 variants (see fuzzy_join_on_postcode's variant
+    # columns).
+    "address1",
+    "address2",
    "postcode",
    "uprn",
    "current_energy_rating",
@@ -150,6 +157,12 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
    return (
        raw.select(
            _clean_string("address").alias("epc_address"),
+            # Match variants: the full address minus the locality-bearing
+            # trailing lines. Inadmissible variants (ones whose dropped lines
+            # carry numbers or flat designators) are filtered inside the
+            # fuzzy join.
+            _join_address_parts("address1").alias("epc_address_a1"),
+            _join_address_parts("address1", "address2").alias("epc_address_a12"),
            _clean_string("postcode").str.to_uppercase().alias("epc_postcode"),
            # UPRN keys an exact listing->EPC join downstream (~99% populated).
            _clean_string("uprn").alias("uprn"),
@@ -536,6 +549,12 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
        .filter(pl.col("pp_property_type") != "Other")
        .with_columns(
            _join_address_parts("saon", "paon", "street").alias("pp_address"),
+            # Match variant with the locality appended: the EPC address often
+            # carries a village/locality token the bare saon+paon+street
+            # lacks, which alone drags short addresses below the threshold.
+            _join_address_parts("saon", "paon", "street", "locality").alias(
+                "pp_address_loc"
+            ),
        )
        .with_columns(
            normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
@@ -597,6 +616,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
        .group_by("_pp_group_address", "_pp_group_postcode", maintain_order=True)
        .agg(
            pl.col("pp_address").last(),
+            pl.col("pp_address_loc").last(),
            pl.col("postcode").last(),
            pl.col("_pp_match_address").last(),
            pl.col("_pp_match_postcode").last(),
@@ -633,6 +653,8 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
            right_address_col="epc_address",
            left_postcode_col="postcode",
            right_postcode_col="epc_postcode",
+            left_variant_cols=["pp_address_loc"],
+            right_variant_cols=["epc_address_a1", "epc_address_a12"],
        )
        .drop("epc_postcode")
        # Audit trail: keep the fuzzy-match confidence (100 = exact address
@@ -672,6 +694,9 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
        [
            "old_new",
            "first_transfer_date",
+            "pp_address_loc",
+            "epc_address_a1",
+            "epc_address_a12",
            "_pp_match_address",
            "_pp_match_postcode",
            "_pp_group_address",
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@@ -24,9 +24,12 @@ from pipeline.transform.price_estimation.knn import (
    MIN_COMPARABLE_PSM,
 )
 from pipeline.utils.fuzzy_join import (
+    _NUMBER_RE as _SUFFIXED_NUMBER_RE,
+    _numbers_compatible as _equal_numbers_compatible,
    normalize_address_key,
    normalize_postcode_key,
 )
+from pipeline.utils.normalize import drop_digit_tokens
 from pipeline.utils.postcode_mapping import build_postcode_mapping

 MIN_FLOOR_AREA_M2 = 10
@@ -209,8 +212,15 @@ def _is_dynamic_poi_metric_column(column: str) -> bool:
    )


-def _numbers_compatible(left: str, right: str) -> bool:
-    """Require address/list-entry numbers to agree when either side has numbers."""
+def _subset_numbers_compatible(left: str, right: str) -> bool:
+    """Require one side's numbers to be a subset of the other's.
+
+    Subset (not equality) is correct ONLY for listed-building name matching: a
+    list entry like "10-12 HIGH STREET" should flag "10 HIGH STREET". Address-
+    to-address matching must use the canonical `fuzzy_join._numbers_compatible`
+    instead (set equality over ``\\d+[A-Z]?`` tokens) — subset semantics there
+    let a single flat absorb its whole building (see fuzzy_join docstring).
+    """
    left_nums = set(_NUMBER_RE.findall(left))
    right_nums = set(_NUMBER_RE.findall(right))
    smaller, larger = (
@@ -446,7 +456,7 @@ def _matched_listed_building_flags(
        matched = False
        for address_key in address_keys:
            for listed_name in listed_names:
-                if not _numbers_compatible(address_key, listed_name):
+                if not _subset_numbers_compatible(address_key, listed_name):
                    continue
                if fuzz.token_set_ratio(address_key, listed_name) >= min_score:
                    matched = True
@@ -1152,8 +1162,9 @@ def _address_score(query: str, candidate: str | None, *, allow_token_set: bool)
    # token (e.g. "KINGSWOOD") subsets to 100 against any long address that
    # merely contains it — so number-less queries score with token_sort_ratio
    # only, matching the canonical fuzzy_join._score_bucket. For a NUMBERED
-    # query the unconditional _numbers_compatible gate has already guaranteed the
-    # candidate carries compatible house numbers, so token_set cannot inflate
+    # query the unconditional fuzzy_join._numbers_compatible gate has already
+    # guaranteed the candidate carries identical house numbers, so token_set
+    # cannot inflate
    # across different addresses; allowing it recovers genuine matches where the
    # scraped listing appends trailing town/county tokens the bare register
    # address omits (e.g. "105 RIDGEWAY DRIVE BROMLEY KENT" vs "105 RIDGEWAY
@@ -1213,7 +1224,7 @@ def _rooms_bonus(left: int | None, right: int | None) -> float:
 def _street_only_address(address: str) -> str:
    """The street/locality part of a normalised address: digit-bearing tokens
    (house numbers, flat numbers, including letter suffixes like 8A) removed."""
-    return " ".join(token for token in address.split() if not _NUMBER_RE.search(token))
+    return drop_digit_tokens(address)


 def _is_specific_street_query(query: str) -> bool:
@@ -1262,9 +1273,9 @@ def _best_listing_match(
    ``uprn_index`` (postcode-independent, so it is robust even when the
    listing's postcode is slightly off); (2) failing that, the highest
    fuzzy street-address similarity within the listing's own postcode bucket.
-    No property-attribute heuristics are used — `_numbers_compatible` gates
-    every fuzzy match unconditionally (so a number-less listing can never match
-    a numbered property, and vice versa), as in the canonical
+    No property-attribute heuristics are used — `fuzzy_join._numbers_compatible`
+    gates every fuzzy match unconditionally (so a number-less listing can never
+    match a numbered property, and vice versa), as in the canonical
    `fuzzy_join._score_bucket`. A house number additionally lowers the score
    threshold and (via `_address_score`) permits token_set scoring; a number-less
    address scores on token_sort only and must match the street almost exactly.
@@ -1294,9 +1305,11 @@ def _best_listing_match(
            address = candidate.get(field)
            if not address:
                continue
-            # Unconditional number gate (matches fuzzy_join): a number-less
-            # listing cannot match a numbered candidate and vice versa.
-            if not _numbers_compatible(query, address):
+            # Unconditional number gate (the canonical fuzzy_join one: set
+            # equality over suffix-aware tokens): a number-less listing cannot
+            # match a numbered candidate, 8A cannot match 8B, and a flat
+            # cannot absorb its whole building.
+            if not _equal_numbers_compatible(query, address):
                continue
            score = _address_score(query, address, allow_token_set=listing_has_numbers)
            if score > best_score:
@@ -1388,7 +1401,7 @@ def _best_street_epc_fallback(
        street_score_cache[cache_key] = qualifying

    listing_postcode = listing.get("_listing_match_postcode")
-    listing_numbers = set(_NUMBER_RE.findall(query))
+    listing_numbers = set(_SUFFIXED_NUMBER_RE.findall(query))
    best: dict | None = None
    best_total = float("-inf")
    best_street_score = 0
@@ -1417,7 +1430,9 @@ def _best_street_epc_fallback(
            ):
                total += _STREET_FALLBACK_SAME_POSTCODE_BONUS
            if listing_numbers and listing_numbers & set(
-                _NUMBER_RE.findall(candidate.get("_direct_epc_match_address") or "")
+                _SUFFIXED_NUMBER_RE.findall(
+                    candidate.get("_direct_epc_match_address") or ""
+                )
            ):
                total += _STREET_FALLBACK_NUMBER_OVERLAP_BONUS
            if total > best_total:
--- a/pipeline/transform/school_catchments.py
+++ b/pipeline/transform/school_catchments.py
@@ -88,6 +88,12 @@ SECONDARY_AGES = (11, 15)
 NURSERY_COHORT_WEIGHT = 0.5  # ages < 4
 SIXTH_FORM_COHORT_WEIGHT = 0.6  # ages >= 16

+# Assumed bounds for the one-sided age-range shapes GIAS emits when a
+# statutory age is missing: "up to {high}" starts at the earliest nursery
+# intake, "{low}+" runs to the end of sixth form.
+EARLIEST_INTAKE_AGE = 2
+DEFAULT_LEAVING_AGE = 19
+
 # Only schools that admit (mostly) by geography take part in the assignment.
 # Independent, special and Welsh schools and post-16 colleges either don't
 # admit by distance or fall outside the England postcode universe; selective
@@ -296,11 +302,28 @@ def phase_intakes(gias: pl.DataFrame) -> pl.DataFrame:
    e.g. "3–11" = ages 3..10) with nursery and sixth-form ages down-weighted,
    and each phase receives the share of cohort weight in its age band.
    """
-    ages = pl.col("age_range").str.extract_all(r"\d+")
-    low = ages.list.get(0, null_on_oob=True).cast(pl.Int64, strict=False)
+    # gias._format_age_range emits three shapes: "{low}–{high}", "up to {high}"
+    # (StatutoryLowAge missing) and "{low}+" (StatutoryHighAge missing). Parse
+    # all three — the one-sided shapes previously fell through the two-number
+    # parse and silently dropped the school from the catchment supply.
+    age = pl.col("age_range")
+    leading = age.str.extract(r"^\s*(\d+)", 1).cast(pl.Int64, strict=False)
+    trailing = age.str.extract(r"(\d+)\s*$", 1).cast(pl.Int64, strict=False)
+    low = (
+        pl.when(age.str.starts_with("up to"))
+        .then(pl.lit(EARLIEST_INTAKE_AGE, dtype=pl.Int64))
+        .otherwise(leading)
+    )
    # The leaving age is exclusive as a cohort: a "3-11" school teaches
-    # children aged 3 through 10.
-    high = ages.list.get(1, null_on_oob=True).cast(pl.Int64, strict=False) - 1
+    # children aged 3 through 10. "{low}+" schools get the end of sixth form
+    # as their assumed leaving age (post-19 institutions then carry no
+    # primary/secondary cohort weight and drop out naturally).
+    high = (
+        pl.when(age.str.ends_with("+"))
+        .then(pl.lit(DEFAULT_LEAVING_AGE, dtype=pl.Int64))
+        .otherwise(trailing)
+        - 1
+    )

    schools = (
        gias.filter(
--- a/pipeline/transform/test_crime.py
+++ b/pipeline/transform/test_crime.py
@@ -275,6 +275,51 @@ def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
    assert burglaries["E01000099"] == [{"year": 2024, "count": 12.0}]


+def test_transform_crime_sums_mixed_weights_within_a_target_lsoa(tmp_path):
+    """Irregular (M:N) recodes can land rows with DIFFERENT `_weight`s in the
+    same (lsoa21, year, type) group: here E01000050 receives 0.5-weighted
+    incidents from split E01000001 alongside a 1.0-weighted incident from
+    E01000099. The aggregation must sum per-incident weights; the old
+    `_weight.first() * len` applied one row's weight to all three
+    (nondeterministically 1.5 or 3.0 instead of 2.0)."""
+    crime_dir = tmp_path / "crime"
+    month_dir = crime_dir / "2024-01"
+    month_dir.mkdir(parents=True)
+
+    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
+    (month_dir / "2024-01-test-force-street.csv").write_text(
+        "\n".join(
+            [
+                header,
+                "1,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
+                "2,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
+                "3,2024-01,F,F,-0.1,51.5,X,E01000099,L,Burglary,U,",
+            ]
+        )
+        + "\n"
+    )
+
+    lookup_path = tmp_path / "lookup.parquet"
+    pl.DataFrame(
+        {
+            "lsoa11": ["E01000001", "E01000001", "E01000099"],
+            "lsoa21": ["E01000050", "E01000051", "E01000050"],
+        }
+    ).write_parquet(lookup_path)
+
+    output = tmp_path / "crime.parquet"
+    by_year_output = tmp_path / "by_year.parquet"
+    transform_crime(crime_dir, output, by_year_output, lookup_path)
+
+    # E01000050: 0.5 + 0.5 + 1.0 = 2.0 incidents -> 24/yr annualised.
+    # E01000051: 0.5 + 0.5 = 1.0 incident -> 12/yr.
+    avg = pl.read_parquet(output).sort("LSOA code").to_dicts()
+    assert avg == [
+        {"LSOA code": "E01000050", "Burglary (avg/yr)": 24.0},
+        {"LSOA code": "E01000051", "Burglary (avg/yr)": 12.0},
+    ]
+
+
 def test_transform_crime_maps_legacy_crime_types(tmp_path):
    """Pre-2014 police.uk type names are aliased to current equivalents instead
    of being dropped."""
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@@ -25,6 +25,8 @@ def _write_csv(path: Path, fieldnames: list[str], rows: list[dict[str, str]]) ->
 def _row(**overrides: str) -> dict[str, str]:
    row = {
        "address": "1 Example Street",
+        "address1": "1 Example Street",
+        "address2": "Hale",
        "postcode": " aa1 1aa ",
        "uprn": "100012345678",
        "current_energy_rating": "c",
@@ -54,6 +56,8 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
    assert df.to_dicts() == [
        {
            "epc_address": "1 Example Street",
+            "epc_address_a1": "1 Example Street",
+            "epc_address_a12": "1 Example Street Hale",
            "epc_postcode": "AA1 1AA",
            "uprn": "100012345678",
            "current_energy_rating": "C",
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@@ -1609,6 +1609,37 @@ def test_best_listing_match_numbered_query_cannot_subset_inflate_across_numbers(
    assert result is None


+def test_best_listing_match_letter_suffix_flats_do_not_cross_match() -> None:
+    # Regression: the gate uses fuzzy_join's suffix-aware tokens, so "8A" and
+    # "8B" are different numbers. Under the old digit-only tokens both looked
+    # like {8} and token_sort scored ~93, attaching the wrong flat's record
+    # whenever the true candidate was absent from the bucket.
+    candidates = [{"pp_address": "8B HIGH STREET"}]
+    result = _best_listing_match(
+        listing_uprn=None,
+        query="8A HIGH STREET",
+        uprn_index={},
+        bucket_candidates=candidates,
+        addressed_fields=["pp_address"],
+    )
+    assert result is None
+
+
+def test_best_listing_match_building_listing_cannot_absorb_single_flat() -> None:
+    # Regression: set equality (not subset) over number tokens, so a whole-
+    # building listing "188 GREAT NORTH WAY" no longer matches "FLAT 1 188
+    # GREAT NORTH WAY" (token_set would have scored the pair 100).
+    candidates = [{"pp_address": "FLAT 1 188 GREAT NORTH WAY"}]
+    result = _best_listing_match(
+        listing_uprn=None,
+        query="188 GREAT NORTH WAY",
+        uprn_index={},
+        bucket_candidates=candidates,
+        addressed_fields=["pp_address"],
+    )
+    assert result is None
+
+
 def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
    None
 ):
--- a/pipeline/transform/test_school_catchments.py
+++ b/pipeline/transform/test_school_catchments.py
@@ -191,6 +191,28 @@ def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
    assert intakes["secondary_intake"].to_list() == [0.0, 500.0, 0.0, 500.0, 1000.0]


+def test_phase_intakes_parses_one_sided_age_ranges():
+    """gias._format_age_range emits "up to {high}" and "{low}+" when a
+    statutory age is missing; those schools must stay in the catchment supply
+    instead of being silently dropped by a two-number parse."""
+    intakes = phase_intakes(
+        pl.DataFrame(
+            [
+                # "up to 11" = assumed cohorts 2..10: nursery years 2-3 weigh
+                # 0.5 each, primary 4..10 weighs 7 -> primary 210 * 7/8.
+                _gias_row(1, age_range="up to 11", pupils=210),
+                # "16+" = assumed cohorts 16..18, all sixth form: no
+                # primary/secondary intake, so the school contributes nothing
+                # but must not crash the parse.
+                _gias_row(2, age_range="16+", pupils=400),
+            ]
+        )
+    ).sort("urn")
+    assert intakes["urn"].to_list() == [1, 2]
+    assert intakes["primary_intake"].to_list() == [210.0 * 7 / 8, 0.0]
+    assert intakes["secondary_intake"].to_list() == [0.0, 0.0]
+
+
 def test_phase_intakes_excludes_non_state_and_selective_schools():
    intakes = phase_intakes(
        pl.DataFrame(
--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@@ -5,6 +5,7 @@ import numpy as np
 import polars as pl

 from pipeline.utils.england_geometry import in_england_mask
+from pipeline.utils.normalize import strip_or_empty

 DROP_CATEGORIES = {
    # GEOLYTIX Grocery Retail Points is the authoritative supermarket source
@@ -1313,9 +1314,7 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {


 def normalize_grocery_retailer(retailer: str | None) -> str:
-    if retailer is None:
-        return ""
-    retailer = retailer.strip()
+    retailer = strip_or_empty(retailer)
    if retailer in COOP_RETAILERS:
        return "Co-op"
    return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)