Split up
This commit is contained in:
parent
cf39ad754e
commit
f59d01227b
91 changed files with 10370 additions and 7562 deletions
|
|
@ -123,10 +123,13 @@ def transform_crime(
|
|||
)
|
||||
|
||||
yearly_counts = (
|
||||
filtered.group_by("LSOA code", "year", "Crime type", "Month")
|
||||
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
|
||||
.group_by("LSOA code", "year", "Crime type")
|
||||
.agg(pl.col("count").sum().alias("count"))
|
||||
# Sum per-incident weights directly: a 2021 LSOA can receive incidents
|
||||
# carrying different `_weight`s in the same month (split 2011 parent at
|
||||
# 1/N alongside an unsplit one at 1), so `_weight.first() * len` would
|
||||
# apply one row's weight to all of them — and nondeterministically so,
|
||||
# since `first` after a join has no ordering guarantee.
|
||||
filtered.group_by("LSOA code", "year", "Crime type")
|
||||
.agg(pl.col("_weight").sum().alias("count"))
|
||||
.join(months_per_year, on="year")
|
||||
.with_columns(
|
||||
(pl.col("count") * 12.0 / pl.col("months_in_year")).alias("per_year")
|
||||
|
|
@ -191,10 +194,10 @@ def _write_crime_by_year(
|
|||
)
|
||||
|
||||
yearly_per_type = (
|
||||
filtered.group_by("LSOA code", "Crime type", "year", "Month")
|
||||
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
|
||||
.group_by("LSOA code", "Crime type", "year")
|
||||
.agg(pl.col("count").sum().alias("count"))
|
||||
# Per-incident weight sum, not `_weight.first() * len` — see the
|
||||
# matching comment in transform_crime.
|
||||
filtered.group_by("LSOA code", "Crime type", "year")
|
||||
.agg(pl.col("_weight").sum().alias("count"))
|
||||
.join(months_per_year, on="year")
|
||||
.with_columns(
|
||||
(pl.col("count").cast(pl.Float32) * 12.0 / pl.col("months_in_year"))
|
||||
|
|
|
|||
|
|
@ -97,6 +97,13 @@ def epc_band_to_year(band: pl.Expr) -> pl.Expr:
|
|||
|
||||
EPC_SOURCE_COLUMNS = [
|
||||
"address",
|
||||
# The individual lines behind `address` (= address1+2+3): address2/3
|
||||
# frequently carry a village/locality token that the price-paid address
|
||||
# lacks, so the matcher also scores against address1-only and
|
||||
# address1+address2 variants (see fuzzy_join_on_postcode's variant
|
||||
# columns).
|
||||
"address1",
|
||||
"address2",
|
||||
"postcode",
|
||||
"uprn",
|
||||
"current_energy_rating",
|
||||
|
|
@ -150,6 +157,12 @@ def _select_epc_columns(raw: pl.LazyFrame) -> pl.LazyFrame:
|
|||
return (
|
||||
raw.select(
|
||||
_clean_string("address").alias("epc_address"),
|
||||
# Match variants: the full address minus the locality-bearing
|
||||
# trailing lines. Inadmissible variants (ones whose dropped lines
|
||||
# carry numbers or flat designators) are filtered inside the
|
||||
# fuzzy join.
|
||||
_join_address_parts("address1").alias("epc_address_a1"),
|
||||
_join_address_parts("address1", "address2").alias("epc_address_a12"),
|
||||
_clean_string("postcode").str.to_uppercase().alias("epc_postcode"),
|
||||
# UPRN keys an exact listing->EPC join downstream (~99% populated).
|
||||
_clean_string("uprn").alias("uprn"),
|
||||
|
|
@ -536,6 +549,12 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
.filter(pl.col("pp_property_type") != "Other")
|
||||
.with_columns(
|
||||
_join_address_parts("saon", "paon", "street").alias("pp_address"),
|
||||
# Match variant with the locality appended: the EPC address often
|
||||
# carries a village/locality token the bare saon+paon+street
|
||||
# lacks, which alone drags short addresses below the threshold.
|
||||
_join_address_parts("saon", "paon", "street", "locality").alias(
|
||||
"pp_address_loc"
|
||||
),
|
||||
)
|
||||
.with_columns(
|
||||
normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
|
||||
|
|
@ -597,6 +616,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
.group_by("_pp_group_address", "_pp_group_postcode", maintain_order=True)
|
||||
.agg(
|
||||
pl.col("pp_address").last(),
|
||||
pl.col("pp_address_loc").last(),
|
||||
pl.col("postcode").last(),
|
||||
pl.col("_pp_match_address").last(),
|
||||
pl.col("_pp_match_postcode").last(),
|
||||
|
|
@ -633,6 +653,8 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
right_address_col="epc_address",
|
||||
left_postcode_col="postcode",
|
||||
right_postcode_col="epc_postcode",
|
||||
left_variant_cols=["pp_address_loc"],
|
||||
right_variant_cols=["epc_address_a1", "epc_address_a12"],
|
||||
)
|
||||
.drop("epc_postcode")
|
||||
# Audit trail: keep the fuzzy-match confidence (100 = exact address
|
||||
|
|
@ -672,6 +694,9 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
[
|
||||
"old_new",
|
||||
"first_transfer_date",
|
||||
"pp_address_loc",
|
||||
"epc_address_a1",
|
||||
"epc_address_a12",
|
||||
"_pp_match_address",
|
||||
"_pp_match_postcode",
|
||||
"_pp_group_address",
|
||||
|
|
|
|||
|
|
@ -24,9 +24,12 @@ from pipeline.transform.price_estimation.knn import (
|
|||
MIN_COMPARABLE_PSM,
|
||||
)
|
||||
from pipeline.utils.fuzzy_join import (
|
||||
_NUMBER_RE as _SUFFIXED_NUMBER_RE,
|
||||
_numbers_compatible as _equal_numbers_compatible,
|
||||
normalize_address_key,
|
||||
normalize_postcode_key,
|
||||
)
|
||||
from pipeline.utils.normalize import drop_digit_tokens
|
||||
from pipeline.utils.postcode_mapping import build_postcode_mapping
|
||||
|
||||
MIN_FLOOR_AREA_M2 = 10
|
||||
|
|
@ -209,8 +212,15 @@ def _is_dynamic_poi_metric_column(column: str) -> bool:
|
|||
)
|
||||
|
||||
|
||||
def _numbers_compatible(left: str, right: str) -> bool:
|
||||
"""Require address/list-entry numbers to agree when either side has numbers."""
|
||||
def _subset_numbers_compatible(left: str, right: str) -> bool:
|
||||
"""Require one side's numbers to be a subset of the other's.
|
||||
|
||||
Subset (not equality) is correct ONLY for listed-building name matching: a
|
||||
list entry like "10-12 HIGH STREET" should flag "10 HIGH STREET". Address-
|
||||
to-address matching must use the canonical `fuzzy_join._numbers_compatible`
|
||||
instead (set equality over ``\\d+[A-Z]?`` tokens) — subset semantics there
|
||||
let a single flat absorb its whole building (see fuzzy_join docstring).
|
||||
"""
|
||||
left_nums = set(_NUMBER_RE.findall(left))
|
||||
right_nums = set(_NUMBER_RE.findall(right))
|
||||
smaller, larger = (
|
||||
|
|
@ -446,7 +456,7 @@ def _matched_listed_building_flags(
|
|||
matched = False
|
||||
for address_key in address_keys:
|
||||
for listed_name in listed_names:
|
||||
if not _numbers_compatible(address_key, listed_name):
|
||||
if not _subset_numbers_compatible(address_key, listed_name):
|
||||
continue
|
||||
if fuzz.token_set_ratio(address_key, listed_name) >= min_score:
|
||||
matched = True
|
||||
|
|
@ -1152,8 +1162,9 @@ def _address_score(query: str, candidate: str | None, *, allow_token_set: bool)
|
|||
# token (e.g. "KINGSWOOD") subsets to 100 against any long address that
|
||||
# merely contains it — so number-less queries score with token_sort_ratio
|
||||
# only, matching the canonical fuzzy_join._score_bucket. For a NUMBERED
|
||||
# query the unconditional _numbers_compatible gate has already guaranteed the
|
||||
# candidate carries compatible house numbers, so token_set cannot inflate
|
||||
# query the unconditional fuzzy_join._numbers_compatible gate has already
|
||||
# guaranteed the candidate carries identical house numbers, so token_set
|
||||
# cannot inflate
|
||||
# across different addresses; allowing it recovers genuine matches where the
|
||||
# scraped listing appends trailing town/county tokens the bare register
|
||||
# address omits (e.g. "105 RIDGEWAY DRIVE BROMLEY KENT" vs "105 RIDGEWAY
|
||||
|
|
@ -1213,7 +1224,7 @@ def _rooms_bonus(left: int | None, right: int | None) -> float:
|
|||
def _street_only_address(address: str) -> str:
|
||||
"""The street/locality part of a normalised address: digit-bearing tokens
|
||||
(house numbers, flat numbers, including letter suffixes like 8A) removed."""
|
||||
return " ".join(token for token in address.split() if not _NUMBER_RE.search(token))
|
||||
return drop_digit_tokens(address)
|
||||
|
||||
|
||||
def _is_specific_street_query(query: str) -> bool:
|
||||
|
|
@ -1262,9 +1273,9 @@ def _best_listing_match(
|
|||
``uprn_index`` (postcode-independent, so it is robust even when the
|
||||
listing's postcode is slightly off); (2) failing that, the highest
|
||||
fuzzy street-address similarity within the listing's own postcode bucket.
|
||||
No property-attribute heuristics are used — `_numbers_compatible` gates
|
||||
every fuzzy match unconditionally (so a number-less listing can never match
|
||||
a numbered property, and vice versa), as in the canonical
|
||||
No property-attribute heuristics are used — `fuzzy_join._numbers_compatible`
|
||||
gates every fuzzy match unconditionally (so a number-less listing can never
|
||||
match a numbered property, and vice versa), as in the canonical
|
||||
`fuzzy_join._score_bucket`. A house number additionally lowers the score
|
||||
threshold and (via `_address_score`) permits token_set scoring; a number-less
|
||||
address scores on token_sort only and must match the street almost exactly.
|
||||
|
|
@ -1294,9 +1305,11 @@ def _best_listing_match(
|
|||
address = candidate.get(field)
|
||||
if not address:
|
||||
continue
|
||||
# Unconditional number gate (matches fuzzy_join): a number-less
|
||||
# listing cannot match a numbered candidate and vice versa.
|
||||
if not _numbers_compatible(query, address):
|
||||
# Unconditional number gate (the canonical fuzzy_join one: set
|
||||
# equality over suffix-aware tokens): a number-less listing cannot
|
||||
# match a numbered candidate, 8A cannot match 8B, and a flat
|
||||
# cannot absorb its whole building.
|
||||
if not _equal_numbers_compatible(query, address):
|
||||
continue
|
||||
score = _address_score(query, address, allow_token_set=listing_has_numbers)
|
||||
if score > best_score:
|
||||
|
|
@ -1388,7 +1401,7 @@ def _best_street_epc_fallback(
|
|||
street_score_cache[cache_key] = qualifying
|
||||
|
||||
listing_postcode = listing.get("_listing_match_postcode")
|
||||
listing_numbers = set(_NUMBER_RE.findall(query))
|
||||
listing_numbers = set(_SUFFIXED_NUMBER_RE.findall(query))
|
||||
best: dict | None = None
|
||||
best_total = float("-inf")
|
||||
best_street_score = 0
|
||||
|
|
@ -1417,7 +1430,9 @@ def _best_street_epc_fallback(
|
|||
):
|
||||
total += _STREET_FALLBACK_SAME_POSTCODE_BONUS
|
||||
if listing_numbers and listing_numbers & set(
|
||||
_NUMBER_RE.findall(candidate.get("_direct_epc_match_address") or "")
|
||||
_SUFFIXED_NUMBER_RE.findall(
|
||||
candidate.get("_direct_epc_match_address") or ""
|
||||
)
|
||||
):
|
||||
total += _STREET_FALLBACK_NUMBER_OVERLAP_BONUS
|
||||
if total > best_total:
|
||||
|
|
|
|||
|
|
@ -88,6 +88,12 @@ SECONDARY_AGES = (11, 15)
|
|||
NURSERY_COHORT_WEIGHT = 0.5 # ages < 4
|
||||
SIXTH_FORM_COHORT_WEIGHT = 0.6 # ages >= 16
|
||||
|
||||
# Assumed bounds for the one-sided age-range shapes GIAS emits when a
|
||||
# statutory age is missing: "up to {high}" starts at the earliest nursery
|
||||
# intake, "{low}+" runs to the end of sixth form.
|
||||
EARLIEST_INTAKE_AGE = 2
|
||||
DEFAULT_LEAVING_AGE = 19
|
||||
|
||||
# Only schools that admit (mostly) by geography take part in the assignment.
|
||||
# Independent, special and Welsh schools and post-16 colleges either don't
|
||||
# admit by distance or fall outside the England postcode universe; selective
|
||||
|
|
@ -296,11 +302,28 @@ def phase_intakes(gias: pl.DataFrame) -> pl.DataFrame:
|
|||
e.g. "3–11" = ages 3..10) with nursery and sixth-form ages down-weighted,
|
||||
and each phase receives the share of cohort weight in its age band.
|
||||
"""
|
||||
ages = pl.col("age_range").str.extract_all(r"\d+")
|
||||
low = ages.list.get(0, null_on_oob=True).cast(pl.Int64, strict=False)
|
||||
# gias._format_age_range emits three shapes: "{low}–{high}", "up to {high}"
|
||||
# (StatutoryLowAge missing) and "{low}+" (StatutoryHighAge missing). Parse
|
||||
# all three — the one-sided shapes previously fell through the two-number
|
||||
# parse and silently dropped the school from the catchment supply.
|
||||
age = pl.col("age_range")
|
||||
leading = age.str.extract(r"^\s*(\d+)", 1).cast(pl.Int64, strict=False)
|
||||
trailing = age.str.extract(r"(\d+)\s*$", 1).cast(pl.Int64, strict=False)
|
||||
low = (
|
||||
pl.when(age.str.starts_with("up to"))
|
||||
.then(pl.lit(EARLIEST_INTAKE_AGE, dtype=pl.Int64))
|
||||
.otherwise(leading)
|
||||
)
|
||||
# The leaving age is exclusive as a cohort: a "3-11" school teaches
|
||||
# children aged 3 through 10.
|
||||
high = ages.list.get(1, null_on_oob=True).cast(pl.Int64, strict=False) - 1
|
||||
# children aged 3 through 10. "{low}+" schools get the end of sixth form
|
||||
# as their assumed leaving age (post-19 institutions then carry no
|
||||
# primary/secondary cohort weight and drop out naturally).
|
||||
high = (
|
||||
pl.when(age.str.ends_with("+"))
|
||||
.then(pl.lit(DEFAULT_LEAVING_AGE, dtype=pl.Int64))
|
||||
.otherwise(trailing)
|
||||
- 1
|
||||
)
|
||||
|
||||
schools = (
|
||||
gias.filter(
|
||||
|
|
|
|||
|
|
@ -275,6 +275,51 @@ def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
|
|||
assert burglaries["E01000099"] == [{"year": 2024, "count": 12.0}]
|
||||
|
||||
|
||||
def test_transform_crime_sums_mixed_weights_within_a_target_lsoa(tmp_path):
|
||||
"""Irregular (M:N) recodes can land rows with DIFFERENT `_weight`s in the
|
||||
same (lsoa21, year, type) group: here E01000050 receives 0.5-weighted
|
||||
incidents from split E01000001 alongside a 1.0-weighted incident from
|
||||
E01000099. The aggregation must sum per-incident weights; the old
|
||||
`_weight.first() * len` applied one row's weight to all three
|
||||
(nondeterministically 1.5 or 3.0 instead of 2.0)."""
|
||||
crime_dir = tmp_path / "crime"
|
||||
month_dir = crime_dir / "2024-01"
|
||||
month_dir.mkdir(parents=True)
|
||||
|
||||
header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
|
||||
(month_dir / "2024-01-test-force-street.csv").write_text(
|
||||
"\n".join(
|
||||
[
|
||||
header,
|
||||
"1,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
|
||||
"2,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
|
||||
"3,2024-01,F,F,-0.1,51.5,X,E01000099,L,Burglary,U,",
|
||||
]
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
lookup_path = tmp_path / "lookup.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
"lsoa11": ["E01000001", "E01000001", "E01000099"],
|
||||
"lsoa21": ["E01000050", "E01000051", "E01000050"],
|
||||
}
|
||||
).write_parquet(lookup_path)
|
||||
|
||||
output = tmp_path / "crime.parquet"
|
||||
by_year_output = tmp_path / "by_year.parquet"
|
||||
transform_crime(crime_dir, output, by_year_output, lookup_path)
|
||||
|
||||
# E01000050: 0.5 + 0.5 + 1.0 = 2.0 incidents -> 24/yr annualised.
|
||||
# E01000051: 0.5 + 0.5 = 1.0 incident -> 12/yr.
|
||||
avg = pl.read_parquet(output).sort("LSOA code").to_dicts()
|
||||
assert avg == [
|
||||
{"LSOA code": "E01000050", "Burglary (avg/yr)": 24.0},
|
||||
{"LSOA code": "E01000051", "Burglary (avg/yr)": 12.0},
|
||||
]
|
||||
|
||||
|
||||
def test_transform_crime_maps_legacy_crime_types(tmp_path):
|
||||
"""Pre-2014 police.uk type names are aliased to current equivalents instead
|
||||
of being dropped."""
|
||||
|
|
|
|||
|
|
@ -25,6 +25,8 @@ def _write_csv(path: Path, fieldnames: list[str], rows: list[dict[str, str]]) ->
|
|||
def _row(**overrides: str) -> dict[str, str]:
|
||||
row = {
|
||||
"address": "1 Example Street",
|
||||
"address1": "1 Example Street",
|
||||
"address2": "Hale",
|
||||
"postcode": " aa1 1aa ",
|
||||
"uprn": "100012345678",
|
||||
"current_energy_rating": "c",
|
||||
|
|
@ -54,6 +56,8 @@ def test_scan_epc_certificates_supports_legacy_uppercase_csv(tmp_path: Path):
|
|||
assert df.to_dicts() == [
|
||||
{
|
||||
"epc_address": "1 Example Street",
|
||||
"epc_address_a1": "1 Example Street",
|
||||
"epc_address_a12": "1 Example Street Hale",
|
||||
"epc_postcode": "AA1 1AA",
|
||||
"uprn": "100012345678",
|
||||
"current_energy_rating": "C",
|
||||
|
|
|
|||
|
|
@ -1609,6 +1609,37 @@ def test_best_listing_match_numbered_query_cannot_subset_inflate_across_numbers(
|
|||
assert result is None
|
||||
|
||||
|
||||
def test_best_listing_match_letter_suffix_flats_do_not_cross_match() -> None:
|
||||
# Regression: the gate uses fuzzy_join's suffix-aware tokens, so "8A" and
|
||||
# "8B" are different numbers. Under the old digit-only tokens both looked
|
||||
# like {8} and token_sort scored ~93, attaching the wrong flat's record
|
||||
# whenever the true candidate was absent from the bucket.
|
||||
candidates = [{"pp_address": "8B HIGH STREET"}]
|
||||
result = _best_listing_match(
|
||||
listing_uprn=None,
|
||||
query="8A HIGH STREET",
|
||||
uprn_index={},
|
||||
bucket_candidates=candidates,
|
||||
addressed_fields=["pp_address"],
|
||||
)
|
||||
assert result is None
|
||||
|
||||
|
||||
def test_best_listing_match_building_listing_cannot_absorb_single_flat() -> None:
|
||||
# Regression: set equality (not subset) over number tokens, so a whole-
|
||||
# building listing "188 GREAT NORTH WAY" no longer matches "FLAT 1 188
|
||||
# GREAT NORTH WAY" (token_set would have scored the pair 100).
|
||||
candidates = [{"pp_address": "FLAT 1 188 GREAT NORTH WAY"}]
|
||||
result = _best_listing_match(
|
||||
listing_uprn=None,
|
||||
query="188 GREAT NORTH WAY",
|
||||
uprn_index={},
|
||||
bucket_candidates=candidates,
|
||||
addressed_fields=["pp_address"],
|
||||
)
|
||||
assert result is None
|
||||
|
||||
|
||||
def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
|
||||
None
|
||||
):
|
||||
|
|
|
|||
|
|
@ -191,6 +191,28 @@ def test_phase_intakes_prorates_fill_target_over_weighted_cohorts():
|
|||
assert intakes["secondary_intake"].to_list() == [0.0, 500.0, 0.0, 500.0, 1000.0]
|
||||
|
||||
|
||||
def test_phase_intakes_parses_one_sided_age_ranges():
|
||||
"""gias._format_age_range emits "up to {high}" and "{low}+" when a
|
||||
statutory age is missing; those schools must stay in the catchment supply
|
||||
instead of being silently dropped by a two-number parse."""
|
||||
intakes = phase_intakes(
|
||||
pl.DataFrame(
|
||||
[
|
||||
# "up to 11" = assumed cohorts 2..10: nursery years 2-3 weigh
|
||||
# 0.5 each, primary 4..10 weighs 7 -> primary 210 * 7/8.
|
||||
_gias_row(1, age_range="up to 11", pupils=210),
|
||||
# "16+" = assumed cohorts 16..18, all sixth form: no
|
||||
# primary/secondary intake, so the school contributes nothing
|
||||
# but must not crash the parse.
|
||||
_gias_row(2, age_range="16+", pupils=400),
|
||||
]
|
||||
)
|
||||
).sort("urn")
|
||||
assert intakes["urn"].to_list() == [1, 2]
|
||||
assert intakes["primary_intake"].to_list() == [210.0 * 7 / 8, 0.0]
|
||||
assert intakes["secondary_intake"].to_list() == [0.0, 0.0]
|
||||
|
||||
|
||||
def test_phase_intakes_excludes_non_state_and_selective_schools():
|
||||
intakes = phase_intakes(
|
||||
pl.DataFrame(
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import numpy as np
|
|||
import polars as pl
|
||||
|
||||
from pipeline.utils.england_geometry import in_england_mask
|
||||
from pipeline.utils.normalize import strip_or_empty
|
||||
|
||||
DROP_CATEGORIES = {
|
||||
# GEOLYTIX Grocery Retail Points is the authoritative supermarket source
|
||||
|
|
@ -1313,9 +1314,7 @@ GROCERY_FASCIA_ICON_NAMES: dict[str, str] = {
|
|||
|
||||
|
||||
def normalize_grocery_retailer(retailer: str | None) -> str:
|
||||
if retailer is None:
|
||||
return ""
|
||||
retailer = retailer.strip()
|
||||
retailer = strip_or_empty(retailer)
|
||||
if retailer in COOP_RETAILERS:
|
||||
return "Co-op"
|
||||
return GROCERY_RETAILER_DISPLAY_NAME_OVERRIDES.get(retailer, retailer)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue