Improve data pipeline
This commit is contained in:
parent
e8345cbdc1
commit
f99bd4e5c9
36 changed files with 966 additions and 129 deletions
|
|
@ -36,7 +36,8 @@ GEOGRAPHY_CODE_REPLACEMENTS = {
|
|||
|
||||
def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
|
||||
# Use the detailed 19+1 breakdown to get sub-categories for Asian ethnicity,
|
||||
# then aggregate back to the broad groups plus South Asian / East Asian split.
|
||||
# then aggregate back to the broad groups plus a South Asian / East/SE Asian
|
||||
# split (Indian/Pakistani/Bangladeshi vs Chinese + other East/SE Asian).
|
||||
detailed = df.filter(
|
||||
(pl.col("Ethnicity_type") == "ONS 2021 19+1") & (pl.col("Ethnicity") != "All")
|
||||
)
|
||||
|
|
@ -53,9 +54,13 @@ def _ethnicity_percentages(df: pl.DataFrame) -> pl.DataFrame:
|
|||
"Indian": "South Asian",
|
||||
"Pakistani": "South Asian",
|
||||
"Bangladeshi": "South Asian",
|
||||
"Any Other Asian Background": "South Asian",
|
||||
# East Asian
|
||||
"Chinese": "East Asian",
|
||||
# East / Southeast Asian. The ONS "Any Other Asian Background" bucket is
|
||||
# predominantly East/Southeast Asian (Filipino, Vietnamese, Thai,
|
||||
# Japanese, Korean, ...) rather than South Asian, so route it here rather
|
||||
# than inflating "% South Asian". The split is approximate (the ONS
|
||||
# bucket also holds some South Asian groups such as Sri Lankan/Nepalese).
|
||||
"Chinese": "East/SE Asian",
|
||||
"Any Other Asian Background": "East/SE Asian",
|
||||
# Black
|
||||
"Black African": "Black",
|
||||
"Black Caribbean": "Black",
|
||||
|
|
|
|||
|
|
@ -35,3 +35,31 @@ def test_ethnicity_percentages_recombines_predecessor_lads_by_population():
|
|||
assert cumberland.select("% White", "% South Asian").to_dicts() == [
|
||||
{"% White": 45.0, "% South Asian": 55.0}
|
||||
]
|
||||
|
||||
|
||||
def test_ethnicity_routes_any_other_asian_to_east_se_asian():
    """'Chinese' and 'Any Other Asian Background' must both land in
    '% East/SE Asian'; neither may be counted towards '% South Asian'."""
    geography_code = "E06000001"
    populations = {
        "Chinese": 30,
        "Any Other Asian Background": 20,
        "Indian": 50,
    }

    rows = []
    for ethnicity, pop in populations.items():
        rows.append(
            {
                "Geography_code": geography_code,
                "Ethnicity_type": "ONS 2021 19+1",
                "Ethnicity": ethnicity,
                "Ethnic Population": pop,
                "Value1": 0.0,
            }
        )

    result = _ethnicity_percentages(pl.DataFrame(rows))
    area = result.filter(pl.col("Geography_code") == geography_code)

    # The renamed column must be present and the old name must be gone.
    assert "% East/SE Asian" in result.columns
    assert "% East Asian" not in result.columns

    # 30 (Chinese) + 20 (Any Other Asian) vs 50 (Indian) out of 100.
    expected = [{"% East/SE Asian": 50.0, "% South Asian": 50.0}]
    assert area.select("% East/SE Asian", "% South Asian").to_dicts() == expected
|
||||
|
|
|
|||
|
|
@ -222,3 +222,108 @@ def test_rejects_active_english_arcgis_boundary_mismatch(tmp_path, monkeypatch,
|
|||
stderr = capsys.readouterr().err
|
||||
assert "active English postcodes" in stderr
|
||||
assert "not active English postcodes" in stderr
|
||||
|
||||
|
||||
def _write_postcode_features(path, rows):
    """Write ``rows`` (anything ``pl.DataFrame`` accepts) to ``path`` as parquet."""
    frame = pl.DataFrame(rows)
    frame.write_parquet(path)
|
||||
|
||||
|
||||
def test_validates_postcode_features_valid(tmp_path, monkeypatch):
    """A clean postcode-features parquet makes ``main()`` return 0."""
    features = {
        "Postcode": ["AA1 1AA", "BB1 1BB"],
        "lat": [51.5, 53.4],
        "lon": [-0.1, -2.2],
        "ctry25cd": ["E92000001", "E92000001"],
        "% White": [80.0, 55.0],
    }
    path = tmp_path / "postcode.parquet"
    _write_postcode_features(path, features)

    argv = ["validate", "--postcode-features", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    assert main() == 0
|
||||
|
||||
|
||||
def test_rejects_contaminated_postcode_features(tmp_path, monkeypatch, capsys):
    """A parquet with a duplicate postcode, a non-English row with null
    coordinates and an out-of-range percentage makes ``main()`` return 1 and
    report each problem on stderr."""
    contaminated = {
        "Postcode": ["AA1 1AA", "AA1 1AA", "CF10 1AA"],  # duplicate AA1 1AA
        "lat": [51.5, 51.5, None],  # Welsh row has null coord
        "lon": [-0.1, -0.1, None],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "% White": [80.0, 150.0, 90.0],  # 150 out of [0,100]
    }
    path = tmp_path / "postcode.parquet"
    _write_postcode_features(path, contaminated)

    argv = ["validate", "--postcode-features", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    err = capsys.readouterr().err
    assert "not unique" in err
    assert "E92000001" in err  # country contamination
    assert "out-of-England" in err or "lat/lon" in err
    assert "[0, 100]" in err
|
||||
|
||||
|
||||
def test_validates_properties_subset(tmp_path, monkeypatch):
    """Properties whose postcodes all appear in the postcode file pass
    the ``--properties-subset`` check (exit code 0)."""
    postcode_path = tmp_path / "postcode.parquet"
    properties_path = tmp_path / "properties.parquet"

    postcode_frame = pl.DataFrame({"Postcode": ["AA1 1AA", "BB1 1BB"]})
    postcode_frame.write_parquet(postcode_path)

    properties_frame = pl.DataFrame(
        {"Postcode": ["AA1 1AA"], "Last known price": [250_000]}
    )
    properties_frame.write_parquet(properties_path)

    # The CLI takes the pair as "<properties>::<postcodes>".
    subset_arg = f"{properties_path}::{postcode_path}"
    monkeypatch.setattr("sys.argv", ["validate", "--properties-subset", subset_arg])

    assert main() == 0
|
||||
|
||||
|
||||
def test_rejects_orphan_properties(tmp_path, monkeypatch, capsys):
    """A property whose postcode is missing from the postcode file, and whose
    price is negative, fails validation with both problems reported."""
    postcode_path = tmp_path / "postcode.parquet"
    properties_path = tmp_path / "properties.parquet"

    pl.DataFrame({"Postcode": ["AA1 1AA"]}).write_parquet(postcode_path)

    # orphan + negative price
    orphan = {"Postcode": ["CC1 1CC"], "Last known price": [-5]}
    pl.DataFrame(orphan).write_parquet(properties_path)

    subset_arg = f"{properties_path}::{postcode_path}"
    monkeypatch.setattr("sys.argv", ["validate", "--properties-subset", subset_arg])

    exit_code = main()
    assert exit_code == 1

    err = capsys.readouterr().err
    assert "absent from" in err
    assert "non-positive" in err
|
||||
|
||||
|
||||
def test_validates_price_index_allows_zero_n_pairs(tmp_path, monkeypatch):
    """Rows with ``n_pairs == 0`` are accepted by the ``--price-index`` check."""
    price_index = {
        "sector": ["A1 1", "A1 1", "B2 2"],
        "type_group": ["All", "Detached", "All"],
        "year": [2024, 2024, 2024],
        "log_index": [0.5, 0.4, 0.0],
        "n_pairs": [100, 0, 0],  # zero n_pairs is a legitimate fallback
    }
    path = tmp_path / "price_index.parquet"
    pl.DataFrame(price_index).write_parquet(path)

    argv = ["validate", "--price-index", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    assert main() == 0
|
||||
|
||||
|
||||
def test_rejects_price_index_nonfinite_and_duplicate(tmp_path, monkeypatch, capsys):
    """An infinite ``log_index`` and a repeated (sector, type_group, year) key
    both fail the ``--price-index`` check and are reported on stderr."""
    price_index = {
        "sector": ["A1 1", "A1 1"],
        "type_group": ["All", "All"],  # duplicate (sector, type_group, year)
        "year": [2024, 2024],
        "log_index": [float("inf"), 0.3],  # non-finite
        "n_pairs": [10, 10],
    }
    path = tmp_path / "price_index.parquet"
    pl.DataFrame(price_index).write_parquet(path)

    argv = ["validate", "--price-index", str(path)]
    monkeypatch.setattr("sys.argv", argv)

    exit_code = main()
    assert exit_code == 1

    err = capsys.readouterr().err
    assert "non-finite" in err
    assert "not unique" in err
|
||||
|
|
|
|||
|
|
@ -28,6 +28,17 @@ MINOR_CRIME_TYPES = (
|
|||
"Other crime",
|
||||
)
|
||||
|
||||
# Legacy police.uk crime-type names (pre-2014 taxonomy) mapped to their closest
# current equivalent. Without this, ~1.9M incidents from 2010-2013 ("Violent
# crime", "Public disorder and weapons") are unrecognised and silently dropped,
# which understates pre-2013 serious crime and creates an artificial 2012->2013
# step in the by-year series.
#
# Apply it with `.replace` (not `.replace_strict`) so that current crime types,
# which have no entry here, pass through unchanged. Keys are the legacy names,
# values are the canonical names.
LEGACY_CRIME_TYPE_ALIASES: dict[str, str] = {
    "Violent crime": "Violence and sexual offences",
    "Public disorder and weapons": "Public order",
}
|
||||
|
||||
|
||||
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
|
||||
csvs = sorted(crime_dir.rglob("*.csv"))
|
||||
|
|
@ -96,6 +107,7 @@ def transform_crime(
|
|||
& pl.col("Crime type").is_not_null()
|
||||
& (pl.col("Crime type") != "")
|
||||
)
|
||||
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
|
||||
.group_by("LSOA code", "Month", "Crime type")
|
||||
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
|
||||
.group_by("LSOA code", "Crime type")
|
||||
|
|
@ -147,7 +159,10 @@ def _write_crime_by_year(
|
|||
& (pl.col("LSOA code") != "")
|
||||
& pl.col("Crime type").is_not_null()
|
||||
& (pl.col("Crime type") != "")
|
||||
).with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
|
||||
).with_columns(
|
||||
pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"),
|
||||
pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES),
|
||||
)
|
||||
|
||||
# Months observed *anywhere* in the dataset for each year (annualisation denominator).
|
||||
# Using crime-type-specific months would over-scale years where a rare type appears
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ from pathlib import Path
|
|||
import polars as pl
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.transform.crime import find_street_crime_csvs
|
||||
from pipeline.transform.crime import LEGACY_CRIME_TYPE_ALIASES, find_street_crime_csvs
|
||||
|
||||
|
||||
def _latest_months(crime_dir: Path, month_count: int) -> list[str]:
|
||||
|
|
@ -80,6 +80,10 @@ def _write_geojsonseq(csvs: list[Path], output_path: Path) -> tuple[int, int]:
|
|||
.drop_nulls(["lon", "lat"])
|
||||
.filter(pl.col("lon").is_between(-9.5, 5.0))
|
||||
.filter(pl.col("lat").is_between(49.0, 57.0))
|
||||
# Canonicalise any legacy pre-2014 type names so the heatmap's crime_type
|
||||
# values always match the frontend's canonical filter list (a no-op for
|
||||
# the recent months this overlay normally covers).
|
||||
.with_columns(pl.col("crime_type").replace(LEGACY_CRIME_TYPE_ALIASES))
|
||||
.group_by("lon", "lat", "month", "crime_type")
|
||||
.len()
|
||||
.rename({"len": "count"})
|
||||
|
|
|
|||
|
|
@ -44,6 +44,7 @@ import shapely
|
|||
from pyproj import Transformer
|
||||
|
||||
from pipeline.transform.crime import (
|
||||
LEGACY_CRIME_TYPE_ALIASES,
|
||||
MINOR_CRIME_TYPES,
|
||||
SERIOUS_CRIME_TYPES,
|
||||
find_street_crime_csvs,
|
||||
|
|
@ -150,6 +151,11 @@ def _accumulate_counts(
|
|||
& (pl.col("Crime type") != "")
|
||||
& pl.col("year").is_in(years)
|
||||
)
|
||||
# Canonicalise legacy pre-2014 crime-type names ("Violent crime",
|
||||
# "Public disorder and weapons") to their current equivalents before
|
||||
# indexing, so ~1.9M historical incidents are counted instead of
|
||||
# dropped. `.replace` leaves current types unchanged.
|
||||
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
|
||||
# Map crime types to indices with default=None so an unrecognised
|
||||
# type yields a null index we can *report* rather than silently drop
|
||||
# (the legacy LSOA path surfaced unknown types via its dynamic pivot).
|
||||
|
|
|
|||
|
|
@ -18,11 +18,49 @@ from ..utils import (
|
|||
normalize_postcode_key,
|
||||
)
|
||||
|
||||
|
||||
pl.Config.set_tbl_cols(-1)
|
||||
|
||||
# Numeric rank for each energy-rating letter: "A" = 1 through "G" = 7.
# NOTE(review): presumably used to order/compare EPC ratings — usage is not
# visible in this excerpt.
RATING_RANK: dict[str, int] = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
# NOTE(review): presumably the minimum sale price accepted by this module —
# confirm against the code that reads it.
MIN_PRICE: int = 50_000

# Plausible construction-year range; band-derived years outside it (e.g. OCR
# noise like 1012 or 2202) are nulled rather than published. Both bounds are
# inclusive where epc_band_to_year applies them.
MIN_BUILD_YEAR: int = 1700
MAX_BUILD_YEAR: int = 2030
|
||||
|
||||
|
||||
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
    """Collapse an EPC construction age band into one representative build year.

    The band is cast to text and stripped of the ``England and Wales: `` prefix
    and a `` onwards`` suffix, then mapped as follows:

    * ``before ...`` bands yield null: open-ended lower bands are too wide to
      pin to a single year.
    * Two-year ranges (e.g. ``1950-1966``) yield the rounded band MIDPOINT
      (1958) rather than the lower bound, which would place every band-derived
      year too early.
    * Anything else yields the first four-digit number found, so single-year
      and ``... onwards`` bands use that year and already-numeric inputs (a
      year produced by an earlier call) pass through unchanged.

    Years outside [MIN_BUILD_YEAR, MAX_BUILD_YEAR] are nulled, and the result
    is cast to ``UInt16``.
    """
    cleaned = band.cast(pl.Utf8)
    cleaned = cleaned.str.replace("England and Wales: ", "")
    cleaned = cleaned.str.replace(" onwards", "")

    # First 4-digit number, and (when the text is a range) the second one.
    first_year = cleaned.str.extract(r"(\d{4})", 1).cast(pl.Int32, strict=False)
    second_year = cleaned.str.extract(r"(\d{4})\D+(\d{4})", 2).cast(
        pl.Int32, strict=False
    )
    midpoint = ((first_year + second_year) / 2).round(0).cast(pl.Int32)

    is_open_ended = cleaned.str.starts_with("before ")
    is_range = second_year.is_not_null()
    candidate = (
        pl.when(is_open_ended)
        .then(None)
        .when(is_range)
        .then(midpoint)
        .otherwise(first_year)
    )

    # Inclusive plausibility window; a null candidate stays null.
    plausible = candidate.is_between(MIN_BUILD_YEAR, MAX_BUILD_YEAR)
    return (
        pl.when(plausible)
        .then(candidate)
        .otherwise(None)
        .cast(pl.UInt16, strict=False)
    )
|
||||
|
||||
|
||||
EPC_SOURCE_COLUMNS = [
|
||||
"address",
|
||||
"postcode",
|
||||
|
|
@ -410,13 +448,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
|
||||
# For new-builds (old_new == "Y"), use the first transaction date year as
|
||||
# the exact construction date; otherwise fall back to the EPC age band.
|
||||
epc_band_year = (
|
||||
pl.col("construction_age_band")
|
||||
.str.replace("England and Wales: ", "")
|
||||
.str.replace(" onwards", "")
|
||||
.str.extract(r"(\d{4})", 1)
|
||||
.cast(pl.UInt16, strict=False)
|
||||
)
|
||||
epc_band_year = epc_band_to_year(pl.col("construction_age_band"))
|
||||
transfer_year = (
|
||||
pl.col("first_transfer_date").dt.year().cast(pl.UInt16, strict=False)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -17,7 +17,11 @@ from shapely.strtree import STRtree
|
|||
from thefuzz import fuzz
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.transform.join_epc_pp import _scan_epc_certificates
|
||||
from pipeline.transform.join_epc_pp import _scan_epc_certificates, epc_band_to_year
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
MAX_COMPARABLE_PSM,
|
||||
MIN_COMPARABLE_PSM,
|
||||
)
|
||||
from pipeline.utils.fuzzy_join import (
|
||||
normalize_address_key,
|
||||
normalize_postcode_key,
|
||||
|
|
@ -59,7 +63,7 @@ _AREA_COLUMNS = [
|
|||
"Air Quality and Road Safety Score",
|
||||
# Ethnicity
|
||||
"% South Asian",
|
||||
"% East Asian",
|
||||
"% East/SE Asian",
|
||||
"% Black",
|
||||
"% Mixed",
|
||||
"% White",
|
||||
|
|
@ -1060,14 +1064,10 @@ def _canonical_epc_property_type_expr() -> pl.Expr:
|
|||
|
||||
|
||||
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    """Return an expression giving the representative build year for ``column``.

    Delegates to the shared ``epc_band_to_year`` mapping so the direct-EPC /
    listings path matches join_epc_pp: band midpoint rather than lower bound,
    with 'before 1900' and implausible years mapped to null. Already-numeric
    inputs pass through unchanged.

    Args:
        column: Name of the column holding the EPC construction age band.

    Returns:
        A polars expression producing the build year (null when unknown).
    """
    # One return only: the earlier inline lower-bound extraction is superseded
    # by the shared helper and must not shadow it.
    return epc_band_to_year(pl.col(column))
|
||||
|
||||
|
||||
def _address_score(query: str, candidate: str | None) -> int:
|
||||
|
|
@ -1956,7 +1956,9 @@ def _build(
|
|||
|
||||
# Broadband: derive max available download speed tier per postcode from
|
||||
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
|
||||
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as string enum.
|
||||
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as a numeric (UInt16) Mbps value so
|
||||
# it sorts/filters correctly; null (not a fabricated 10) when no availability
|
||||
# tier is present, so "no data" is distinguishable from a genuine 10 Mbps.
|
||||
broadband = (
|
||||
pl.scan_parquet(broadband_path)
|
||||
.select(
|
||||
|
|
@ -1969,13 +1971,12 @@ def _build(
|
|||
.then(100)
|
||||
.when(pl.col("SFBB availability (% premises)") > 0)
|
||||
.then(30)
|
||||
.otherwise(10)
|
||||
.otherwise(None)
|
||||
.cast(pl.UInt16)
|
||||
.alias("max_download_speed"),
|
||||
)
|
||||
.group_by("bb_postcode")
|
||||
.agg(pl.col("max_download_speed").max())
|
||||
.with_columns(pl.col("max_download_speed").cast(pl.Utf8))
|
||||
)
|
||||
area_side_tables = {
|
||||
"iod": iod,
|
||||
|
|
@ -2052,9 +2053,20 @@ def _build(
|
|||
.otherwise(pl.col("current_energy_rating"))
|
||||
.alias("current_energy_rating"),
|
||||
).with_columns(
|
||||
(pl.col("latest_price") / pl.col("total_floor_area"))
|
||||
.round(0)
|
||||
.cast(pl.Int32)
|
||||
# Null out implausible per-sqm values (outside the kNN comparable band):
|
||||
# bulk/block transactions divided by a single unit's floor area otherwise
|
||||
# produce figures up to ~£1.5M/sqm.
|
||||
pl.when(
|
||||
(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
|
||||
& (
|
||||
(pl.col("latest_price") / pl.col("total_floor_area"))
|
||||
.is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
|
||||
)
|
||||
)
|
||||
.then(
|
||||
(pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
|
||||
)
|
||||
.otherwise(None)
|
||||
.alias("Price per sqm"),
|
||||
)
|
||||
wide = _finalize_merged_columns(wide)
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ from pathlib import Path
|
|||
|
||||
from pyproj import Transformer
|
||||
from shapely import make_valid, set_precision
|
||||
from shapely.errors import GEOSException
|
||||
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
|
||||
from shapely.ops import transform as transform_geometry
|
||||
from shapely.ops import unary_union
|
||||
|
|
@ -43,7 +44,14 @@ def _largest_polygonal(geom) -> Polygon | None:
|
|||
def to_wgs84_geojson(
|
||||
geom: Polygon | MultiPolygon, tolerance: float = 1.0
|
||||
) -> dict | None:
|
||||
"""Simplify geometry in BNG, convert to WGS84, return GeoJSON dict."""
|
||||
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
|
||||
|
||||
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
|
||||
just the intermediate Shapely object: coordinate snapping during
|
||||
serialization can otherwise leave a self-intersecting ring that only shows up
|
||||
once the feature is read back from disk. Any such geometry is repaired with
|
||||
``make_valid`` before returning so written features are always valid.
|
||||
"""
|
||||
geom = _largest_polygonal(geom)
|
||||
if geom is None:
|
||||
return None
|
||||
|
|
@ -55,12 +63,28 @@ def to_wgs84_geojson(
|
|||
|
||||
transformer = _get_to_wgs84()
|
||||
wgs84 = transform_geometry(transformer.transform, simplified)
|
||||
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
|
||||
try:
|
||||
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
|
||||
except GEOSException:
|
||||
# Precision snapping can fail on pathological geometries; fall back to a
|
||||
# plain validity repair without coordinate snapping.
|
||||
wgs84 = make_valid(wgs84)
|
||||
wgs84 = _largest_polygonal(wgs84)
|
||||
if wgs84 is None:
|
||||
return None
|
||||
|
||||
return mapping(wgs84)
|
||||
geojson_dict = mapping(wgs84)
|
||||
|
||||
# The geometry that actually reaches disk is the GeoJSON dict, so validate
|
||||
# *that* (not the pre-serialization object) and repair if needed.
|
||||
round_trip = shape(geojson_dict)
|
||||
if round_trip.is_empty or not round_trip.is_valid:
|
||||
round_trip = _largest_polygonal(make_valid(round_trip))
|
||||
if round_trip is None or round_trip.is_empty:
|
||||
return None
|
||||
geojson_dict = mapping(round_trip)
|
||||
|
||||
return geojson_dict
|
||||
|
||||
|
||||
def _fill_holes(geom):
|
||||
|
|
@ -119,7 +143,11 @@ def merge_fragments(
|
|||
pre_green = combined
|
||||
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
||||
combined = _largest_polygon(combined)
|
||||
combined = _fill_holes(combined)
|
||||
# Do NOT _fill_holes here: interior holes carved by the greenspace
|
||||
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
|
||||
# Filling them would re-add the removed area and negate the
|
||||
# subtraction. Artifact holes from the INSPIRE+Voronoi+make_valid
|
||||
# chain were already removed by the _fill_holes above (pre-subtraction).
|
||||
# Revert if subtraction + fragment selection lost >90% of area
|
||||
if pre_green.area > 0 and combined.area / pre_green.area < 0.1:
|
||||
combined = pre_green
|
||||
|
|
|
|||
|
|
@ -893,3 +893,54 @@ class TestSubtractGreenspace:
|
|||
result = subtract_greenspace(postcode, tree, geoms)
|
||||
# 80% < 90% cap, so subtraction should happen
|
||||
assert result.area == pytest.approx(2000, rel=0.01)
|
||||
|
||||
|
||||
class TestToWgs84GeojsonValidity:
    """GeoJSON emitted by to_wgs84_geojson must parse back into a valid,
    non-empty geometry."""

    def test_geojson_round_trips_to_valid_geometry(self):
        from shapely.geometry import shape

        square = box(530000, 180000, 530100, 180100)
        geojson = to_wgs84_geojson(square)
        assert geojson is not None

        # Validate what would be written, not the intermediate object.
        round_tripped = shape(geojson)
        assert not round_tripped.is_empty
        assert round_tripped.is_valid

    def test_written_district_features_are_all_valid(self, tmp_path):
        from shapely.geometry import shape

        single = box(530000, 180000, 530100, 180100)
        two_part = MultiPolygon(
            [
                box(530200, 180000, 530250, 180050),
                box(530200, 180060, 530250, 180110),
            ]
        )
        postcodes = {"AA1 1AA": single, "AA1 1AB": two_part}

        written = write_district_geojson(postcodes, tmp_path)
        assert written == 1

        district_file = tmp_path / "units" / "AA1.geojson"
        collection = json.loads(district_file.read_text())
        for feature in collection["features"]:
            parsed = shape(feature["geometry"])
            assert parsed.is_valid
            assert not parsed.is_empty
|
||||
|
||||
|
||||
class TestGreenspaceHolePreserved:
    """merge_fragments must keep interior holes carved by greenspace
    subtraction (the post-subtraction _fill_holes that previously negated them
    was removed)."""

    def test_interior_lake_hole_survives_merge_fragments(self):
        from shapely.strtree import STRtree

        postcode = box(0, 0, 100, 100)  # 10000 sqm
        lake = box(30, 30, 70, 70)  # 1600 sqm fully-interior hole (16% removal)
        greenspace = [lake]

        result = merge_fragments(
            [("TEST1", postcode)],
            greenspace_tree=STRtree(greenspace),
            greenspace_geoms=greenspace,
        )

        merged = result["TEST1"]
        holes = list(merged.interiors)
        assert len(holes) == 1
        assert merged.area == pytest.approx(10000 - 1600, rel=0.05)
|
||||
|
|
|
|||
|
|
@ -67,6 +67,16 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
|
|||
.struct.field("price")
|
||||
.alias("input_price"),
|
||||
)
|
||||
.with_columns(
|
||||
# Date of the input (second-to-last) sale, used by the kNN leakage
|
||||
# filter to exclude the target property's own prior sale from its
|
||||
# comparables. Built from year+month (day defaults to the 1st).
|
||||
pl.date(
|
||||
pl.col("input_year").cast(pl.Int32),
|
||||
pl.col("input_month").cast(pl.Int32),
|
||||
1,
|
||||
).alias("input_date"),
|
||||
)
|
||||
.with_columns(
|
||||
(
|
||||
pl.col("actual_year").cast(pl.Float64)
|
||||
|
|
|
|||
|
|
@ -18,6 +18,8 @@ import polars as pl
|
|||
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
KNN_BLEND_WEIGHT,
|
||||
MAX_COMPARABLE_PSM,
|
||||
MIN_COMPARABLE_PSM,
|
||||
build_knn_pool,
|
||||
knn_median_psm,
|
||||
)
|
||||
|
|
@ -31,7 +33,13 @@ from pipeline.transform.price_estimation.utils import (
|
|||
|
||||
# NOTE(review): presumably the allowed range for the kNN estimate relative to
# the index estimate before the kNN value is treated as unstable — confirm
# against guarded_blend_estimates.
MAX_KNN_TO_INDEX_RATIO = 2.0
MIN_KNN_TO_INDEX_RATIO = 0.5
# Cap the final estimate at this multiple of the last known price as a guard
# against data errors. Set to ~exp(MAX_LOG_ADJUSTMENT) (~20x) so it is
# consistent with the log-index clip already applied to the index move: many
# UK sectors legitimately grew >6x since the 1990s (e.g. parts of inner London
# 12-14x), so the previous 6x cap truncated genuine appreciation rather than
# only catching outliers.
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 20.0
|
||||
|
||||
|
||||
def guarded_blend_estimates(
|
||||
|
|
@ -222,11 +230,22 @@ def main():
|
|||
).height
|
||||
print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")
|
||||
|
||||
# Derive estimated price per sqm where both estimated price and floor area exist
|
||||
# Derive estimated price per sqm where both estimated price and floor area
|
||||
# exist. Null out values outside the plausibility band [MIN_COMPARABLE_PSM,
|
||||
# MAX_COMPARABLE_PSM] (the same band the kNN pool uses): extreme values come
|
||||
# from bulk/block transactions or floor-area errors and are not meaningful
|
||||
# per-unit prices.
|
||||
_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
|
||||
df = df.with_columns(
|
||||
(pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
|
||||
.round(0)
|
||||
.cast(pl.Int32, strict=False)
|
||||
pl.when(
|
||||
pl.col("Estimated current price").is_not_null()
|
||||
& pl.col("Total floor area (sqm)").is_not_null()
|
||||
& (pl.col("Total floor area (sqm)") > 0)
|
||||
& (_est_psm >= MIN_COMPARABLE_PSM)
|
||||
& (_est_psm <= MAX_COMPARABLE_PSM)
|
||||
)
|
||||
.then(_est_psm.round(0).cast(pl.Int32, strict=False))
|
||||
.otherwise(None)
|
||||
.alias("Est. price per sqm"),
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ from pipeline.transform.price_estimation.shrinkage import (
|
|||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_YEAR,
|
||||
TEMPORAL_SMOOTHNESS_LAMBDA,
|
||||
TYPE_GROUPS,
|
||||
build_hedonic_features,
|
||||
extract_centroids,
|
||||
|
|
@ -165,12 +166,50 @@ def solve_robust_index(
|
|||
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
|
||||
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
|
||||
|
||||
# Temporal smoothness prior: penalise curvature in the year betas with a
|
||||
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
|
||||
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
|
||||
# zero target). This damps single-year index spikes without flattening
|
||||
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
|
||||
# year (min_year, implicit beta=0) has no column, so the penalty spans the
|
||||
# non-baseline years only. For cells with <3 betas there is no curvature to
|
||||
# penalise and the solve is unchanged.
|
||||
n_pen = 0
|
||||
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
|
||||
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
|
||||
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
|
||||
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
|
||||
cols_by_year = [c for _, c in sorted(year_to_col.items())]
|
||||
n_pen = n_cols - 2
|
||||
pen_rows = np.repeat(n + np.arange(n_pen), 3)
|
||||
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
|
||||
for k in range(n_pen):
|
||||
pen_cols[3 * k : 3 * k + 3] = (
|
||||
cols_by_year[k],
|
||||
cols_by_year[k + 1],
|
||||
cols_by_year[k + 2],
|
||||
)
|
||||
pen_rows_arr = pen_rows.astype(np.int64)
|
||||
pen_cols_arr = pen_cols
|
||||
pen_vals_arr = np.tile(
|
||||
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
|
||||
).astype(np.float64)
|
||||
pen_b = np.zeros(n_pen, dtype=np.float64)
|
||||
n_total_rows = n + n_pen
|
||||
|
||||
weights = base_weights.copy()
|
||||
|
||||
for _ in range(IRLS_ITERATIONS):
|
||||
data = signs_arr * weights[rows_arr]
|
||||
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
|
||||
b = log_ratios * weights
|
||||
if n_pen:
|
||||
all_data = np.concatenate([data, pen_vals_arr])
|
||||
all_rows = np.concatenate([rows_arr, pen_rows_arr])
|
||||
all_cols = np.concatenate([cols_arr, pen_cols_arr])
|
||||
b = np.concatenate([log_ratios * weights, pen_b])
|
||||
else:
|
||||
all_data, all_rows, all_cols = data, rows_arr, cols_arr
|
||||
b = log_ratios * weights
|
||||
A = csc_matrix((all_data, (all_rows, all_cols)), shape=(n_total_rows, n_cols))
|
||||
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
|
||||
|
||||
# Residuals
|
||||
|
|
|
|||
|
|
@ -96,8 +96,11 @@ def spatial_smooth(
|
|||
for i, sec in enumerate(sectors_with_coords):
|
||||
n = counts.get(sec, 0)
|
||||
self_w = n / (n + SPATIAL_BLEND_K)
|
||||
if self_w > 0.95:
|
||||
continue # enough data, skip smoothing
|
||||
if self_w > 0.90:
|
||||
# Enough data, skip smoothing. Relaxed from 0.95 so higher-volume
|
||||
# cells (n ~270-570) that still carry single-year noise get a light
|
||||
# spatial blend, complementing the temporal smoothness prior.
|
||||
continue
|
||||
|
||||
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
|
||||
# Skip self (index 0, distance ~0)
|
||||
|
|
|
|||
|
|
@ -81,8 +81,21 @@ def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
|
|||
last_prices=np.array([100_000.0, 100_000.0]),
|
||||
)
|
||||
|
||||
# Property 0: unstable kNN (>2x index) is dropped, index estimate kept.
|
||||
assert blended[0] == 120_000.0
|
||||
assert blended[1] == 600_000.0
|
||||
# Property 1: a 10x uplift over the last price is legitimate appreciation and
|
||||
# is no longer truncated (cap raised from 6x to 20x).
|
||||
assert blended[1] == 1_000_000.0
|
||||
|
||||
|
||||
def test_guarded_blend_caps_uplift_at_20x_last_price():
    """An index estimate 50x the last price, with no kNN estimate, is capped
    at the 20x ceiling."""
    index_est = np.array([5_000_000.0])
    knn_est = np.array([np.nan])
    last_prices = np.array([100_000.0])

    blended = guarded_blend_estimates(
        index_est=index_est,
        knn_est=knn_est,
        last_prices=last_prices,
    )

    assert blended[0] == 2_000_000.0  # 100_000 * 20
|
||||
|
||||
|
||||
def test_bungalow_is_not_a_dead_price_index_type_group():
|
||||
|
|
@ -92,3 +105,50 @@ def test_bungalow_is_not_a_dead_price_index_type_group():
|
|||
|
||||
assert "Bungalow" not in TYPE_GROUPS
|
||||
assert df["type_group"].to_list() == [None, None]
|
||||
|
||||
|
||||
def test_temporal_regularization_damps_curvature_without_breaking_solve():
    """The second-difference prior lowers the worst year-to-year curvature
    while the solved index stays well-formed (every year present, finite,
    baseline at 0) and still rises end to end."""
    from pipeline.transform.price_estimation import index as index_mod

    years = np.arange(2010, 2021)
    trend = {y: 0.04 * (y - 2010) for y in years}

    # Adjacent-year pairs following a smooth trend, plus one spurious
    # single-year jump at 2015 (poorly identified curvature spike).
    sale_years = list(years[:-1])
    first_years = sale_years + [2014]
    second_years = [y + 1 for y in sale_years] + [2015]
    log_ratios = [trend[y + 1] - trend[y] for y in sale_years] + [0.5]
    pair_weights = [1.0] * len(log_ratios)

    y1, y2 = np.array(first_years), np.array(second_years)
    lr = np.array(log_ratios, float)
    w = np.array(pair_weights, float)

    def solve(lmbda):
        # Temporarily override the module-level lambda, always restoring it.
        saved = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = lmbda
        try:
            return index_mod.solve_robust_index(y1, y2, lr, w)
        finally:
            index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = saved

    def max_curvature(d):
        ordered = np.array([d[y] for y in sorted(d)])
        second_diff = np.diff(ordered, 2)
        return float(np.abs(second_diff).max())

    unregularised = solve(0.0)
    regularised = solve(0.2)

    # The regularised index is well-formed.
    assert set(regularised) == set(range(2010, 2021))
    assert all(np.isfinite(v) for v in regularised.values())
    assert regularised[2010] == 0.0  # baseline year pinned to 0

    # Regularisation strictly reduces curvature, and never flattens the genuine
    # uptrend (the index still rises end to end).
    assert max_curvature(regularised) < max_curvature(unregularised)
    assert regularised[2020] > regularised[2010]
|
||||
|
|
|
|||
|
|
@ -22,6 +22,13 @@ FLAT_TYPES = ["Flats/Maisonettes"]
|
|||
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
|
||||
SHRINKAGE_K = 50
|
||||
|
||||
# Temporal regularization for the repeat-sales index: a second-difference
# (curvature) penalty
#     lambda * sum_t (beta_t - 2*beta_{t-1} + beta_{t-2})^2
# added to the IRLS solve. A mild penalty damps single-year index spikes (which
# would otherwise distort the estimate of any property whose last sale landed
# on a noisy year) without flattening genuine multi-year trends.
# NOTE(review): the test suite overrides this module attribute around a solve
# and treats 0.0 as "unregularised"; this presumes the solver reads the value
# at call time rather than capturing it at import — confirm before refactoring.
TEMPORAL_SMOOTHNESS_LAMBDA = 0.05
|
||||
|
||||
|
||||
def type_group_expr():
|
||||
"""Polars expression: Property type -> type_group."""
|
||||
|
|
|
|||
|
|
@ -15,6 +15,66 @@ SCHOOL_GROUPS = {
|
|||
}
|
||||
|
||||
|
||||
def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
    """Pick out good/outstanding primary and secondary schools.

    Returns a two-column frame ``(postcode, category)`` where ``category`` is
    one of ``outstanding_primary``, ``good_primary``, ``outstanding_secondary``
    or ``good_secondary``.

    The grade is resolved in two steps:

    1. A graded result of "1" (outstanding) or "2" (good) in "Latest OEIF
       overall effectiveness" (OEIF = the previous Ofsted Education Inspection
       Framework) is taken as-is.
    2. Otherwise, and ONLY when the graded column is unusable (null or
       "Not judged"), the "Ungraded inspection overall outcome" is consulted:
       values starting with "School remains Outstanding" count as grade "1" and
       values starting with "School remains Good" as grade "2" (this prefix
       match also admits the "(Concerns)"/"(Improving)" variants).

    Step 2 exists because many schools were last inspected under an ungraded
    (Section 8) inspection or the post-2024 report-card framework; filtering on
    the graded column alone dropped ~7,000 genuinely good/outstanding schools.
    Because the fallback is gated on the absence of a usable grade, a genuine
    grade 3/4 is never overridden.
    """
    # Cast to Utf8 so the string predicates are well-defined even when a column
    # is entirely null (read back as a Null dtype).
    oeif_text = pl.col("Latest OEIF overall effectiveness").cast(
        pl.Utf8, strict=False
    )
    ungraded_text = pl.col("Ungraded inspection overall outcome").cast(
        pl.Utf8, strict=False
    )

    lacks_grade = oeif_text.is_null() | (oeif_text == "Not judged")
    remains_outstanding = lacks_grade & ungraded_text.str.starts_with(
        "School remains Outstanding"
    )
    remains_good = lacks_grade & ungraded_text.str.starts_with(
        "School remains Good"
    )

    grade = (
        pl.when(oeif_text.is_in(["1", "2"]))
        .then(oeif_text)
        .when(remains_outstanding)
        .then(pl.lit("1"))
        .when(remains_good)
        .then(pl.lit("2"))
        .otherwise(None)
        .alias("_ofsted_grade")
    )

    in_scope = ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
    good_plus = in_scope.with_columns(grade).filter(
        pl.col("_ofsted_grade").is_not_null()
    )

    # Good+ groups include both grade variants; outstanding groups count grade 1.
    is_outstanding = pl.col("_ofsted_grade") == "1"
    primary_label = (
        pl.when(is_outstanding)
        .then(pl.lit("outstanding_primary"))
        .otherwise(pl.lit("good_primary"))
    )
    secondary_label = (
        pl.when(is_outstanding)
        .then(pl.lit("outstanding_secondary"))
        .otherwise(pl.lit("good_secondary"))
    )
    category = (
        pl.when(pl.col("Ofsted phase") == "Primary")
        .then(primary_label)
        .otherwise(secondary_label)
        .alias("category")
    )

    return good_plus.with_columns(category).select(
        pl.col("Postcode").alias("postcode"),
        "category",
    )
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Count good+ and outstanding primary/secondary schools near each postcode"
|
||||
|
|
@ -30,42 +90,14 @@ def main():
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load Ofsted data: filter to good+ (1, 2) primary/secondary schools.
|
||||
# Post-2025 reform the single "Overall effectiveness" grade was retired;
|
||||
# the legacy 1–4 scale is now carried forward under "Latest OEIF overall
|
||||
# effectiveness" (OEIF = the previous Ofsted Education Inspection
|
||||
# Framework). The new report-card columns use text judgements instead.
|
||||
ofsted = pl.read_parquet(args.ofsted).filter(
|
||||
pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
|
||||
& pl.col("Latest OEIF overall effectiveness").is_in(["1", "2"])
|
||||
)
|
||||
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
|
||||
if ofsted.is_empty():
|
||||
raise ValueError("No good+ primary/secondary Ofsted schools found")
|
||||
|
||||
print(f"Good+ schools: {len(ofsted):,}")
|
||||
print(
|
||||
"Outstanding schools: "
|
||||
f"{ofsted.filter(pl.col('Latest OEIF overall effectiveness') == '1').height:,}"
|
||||
)
|
||||
|
||||
# Assign category based on phase and rating. Good+ groups include both
|
||||
# category variants; outstanding groups count grade 1 only.
|
||||
ofsted = ofsted.with_columns(
|
||||
pl.when(pl.col("Ofsted phase") == "Primary")
|
||||
.then(
|
||||
pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
|
||||
.then(pl.lit("outstanding_primary"))
|
||||
.otherwise(pl.lit("good_primary"))
|
||||
)
|
||||
.otherwise(
|
||||
pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
|
||||
.then(pl.lit("outstanding_secondary"))
|
||||
.otherwise(pl.lit("good_secondary"))
|
||||
)
|
||||
.alias("category")
|
||||
).select(
|
||||
pl.col("Postcode").alias("postcode"),
|
||||
"category",
|
||||
f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
|
||||
)
|
||||
|
||||
# Join with arcgis to get lat/lng for each school's postcode
|
||||
|
|
|
|||
|
|
@ -226,3 +226,44 @@ def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
|
|||
assert burglaries["E01000050"] == [{"year": 2024, "count": 12.0}]
|
||||
assert burglaries["E01000051"] == [{"year": 2024, "count": 12.0}]
|
||||
assert burglaries["E01000099"] == [{"year": 2024, "count": 12.0}]
|
||||
|
||||
|
||||
def test_transform_crime_maps_legacy_crime_types(tmp_path):
    """Legacy (pre-2014) police.uk crime types land in their current columns
    rather than being dropped."""
    crime_dir = tmp_path / "crime"
    month_dir = crime_dir / "2013-01"
    month_dir.mkdir(parents=True)

    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
    # One record per crime type, all in the same LSOA and month.
    crime_types = ["Violent crime", "Public disorder and weapons", "Burglary"]
    records = [
        f"{crime_id},2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,{crime_type},Under investigation,"
        for crime_id, crime_type in enumerate(crime_types, start=1)
    ]
    csv_path = month_dir / "2013-01-test-force-street.csv"
    csv_path.write_text("\n".join([header, *records]) + "\n")

    output = tmp_path / "crime.parquet"
    by_year_output = tmp_path / "crime_by_year.parquet"
    transform_crime(crime_dir, output, by_year_output)

    row = pl.read_parquet(output).to_dicts()[0]
    # Single month -> annualised x12. Legacy names mapped to current columns.
    assert row["Violence and sexual offences (avg/yr)"] == 12.0
    assert row["Public order (avg/yr)"] == 12.0
    assert row["Burglary (avg/yr)"] == 12.0
    # The legacy names must NOT survive as their own columns.
    assert "Violent crime (avg/yr)" not in row
    assert "Public disorder and weapons (avg/yr)" not in row

    by_year = pl.read_parquet(by_year_output).row(0, named=True)

    def counts_by_year(column):
        return {point["year"]: point["count"] for point in by_year[column]}

    # Serious = Violence and sexual offences (12) + Burglary (12) = 24
    assert counts_by_year("Serious crime (by year)")[2013] == 24.0
    # Minor = Public order
    assert counts_by_year("Minor crime (by year)")[2013] == 12.0
|
||||
|
|
|
|||
|
|
@ -279,3 +279,37 @@ def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
|
|||
err = capsys.readouterr().err
|
||||
assert "Cyber fraud" in err
|
||||
assert "WARNING" in err
|
||||
|
||||
|
||||
def test_legacy_crime_types_are_mapped(tmp_path):
    """The spatial transform aliases pre-2014 crime-type names to their current
    equivalents instead of discarding them as unknown types."""
    units = tmp_path / "units"
    boundaries = {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
    _write_boundaries(units, boundaries)

    crime = tmp_path / "crime"
    legacy_types = ("Violent crime", "Public disorder and weapons")
    legacy_rows = [
        _crime_row("2013-01", 1005, 1005, crime_type) for crime_type in legacy_types
    ]
    _write_month(crime, "2013-01", legacy_rows)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    # Single postcode -> area-norm factor 1.0; single month/year -> x12.
    row = pl.read_parquet(output).to_dicts()[0]
    assert row["Violence and sexual offences (avg/yr)"] == 12.0
    assert row["Public order (avg/yr)"] == 12.0

    by_year_row = pl.read_parquet(by_year).row(0, named=True)
    expected_series = [{"year": 2013, "count": 12.0}]
    assert by_year_row["Violence and sexual offences (by year)"] == expected_series
    assert by_year_row["Public order (by year)"] == expected_series
|
||||
|
|
|
|||
|
|
@ -169,7 +169,8 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
|
|||
"epc_address": "1 Example Street",
|
||||
"current_energy_rating": "C",
|
||||
"total_floor_area": 85.0,
|
||||
"construction_age_band": 1950,
|
||||
# Band midpoint of 1950-1966, not the lower bound.
|
||||
"construction_age_band": 1958,
|
||||
"was_council_house": "Yes",
|
||||
}
|
||||
]
|
||||
|
|
@ -256,3 +257,26 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
|
|||
"current_energy_rating": None,
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def test_epc_band_to_year_uses_midpoint_and_clamps():
    """Each EPC age band maps to the year asserted here, or to null."""
    import polars as pl

    from pipeline.transform.join_epc_pp import epc_band_to_year

    # (input band, expected year)
    cases = [
        ("England and Wales: 1950-1966", 1958),  # midpoint
        ("1900-1929", 1914),  # midpoint
        ("England and Wales: before 1900", None),  # too wide -> null
        ("2012 onwards", 2012),  # single year
        ("1012", None),  # implausible -> null
        ("2202", None),  # implausible -> null
        (None, None),  # null -> null
        ("1958", 1958),  # already-numeric-as-string -> pass through
    ]
    bands = [band for band, _ in cases]
    expected = [year for _, year in cases]

    df = pl.DataFrame({"b": bands})
    years = df.select(epc_band_to_year(pl.col("b")).alias("y"))["y"].to_list()
    assert years == expected
|
||||
|
|
|
|||
82
pipeline/transform/test_school_proximity.py
Normal file
82
pipeline/transform/test_school_proximity.py
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.school_proximity import classify_good_plus_schools
|
||||
|
||||
|
||||
def _school(phase, oeif, ungraded, postcode="AA1 1AA"):
|
||||
return {
|
||||
"Postcode": postcode,
|
||||
"Ofsted phase": phase,
|
||||
"Latest OEIF overall effectiveness": oeif,
|
||||
"Ungraded inspection overall outcome": ungraded,
|
||||
}
|
||||
|
||||
|
||||
def _classify(rows):
    """Run the classifier on ``rows`` and return its output as a set of
    ``(postcode, category)`` tuples (order-insensitive comparison)."""
    classified = classify_good_plus_schools(pl.DataFrame(rows)).to_dicts()
    pairs = set()
    for record in classified:
        pairs.add((record["postcode"], record["category"]))
    return pairs
|
||||
|
||||
|
||||
def test_legacy_oeif_grades_1_and_2_are_kept():
    """Graded OEIF results 1 and 2 are kept and labelled per phase."""
    # (phase, OEIF grade, postcode, expected category)
    cases = [
        ("Primary", "1", "AA1 1AA", "outstanding_primary"),
        ("Primary", "2", "AA1 1AB", "good_primary"),
        ("Secondary", "1", "AA1 1AC", "outstanding_secondary"),
        ("Secondary", "2", "AA1 1AD", "good_secondary"),
    ]
    rows = [_school(phase, grade, None, postcode) for phase, grade, postcode, _ in cases]
    expected = {(postcode, category) for _, _, postcode, category in cases}
    assert _classify(rows) == expected
|
||||
|
||||
|
||||
def test_grades_3_and_4_are_excluded():
    """Graded results below good (3 and 4) produce no output rows."""
    rows = [_school("Primary", grade, None) for grade in ("3", "4")]
    assert _classify(rows) == set()
|
||||
|
||||
|
||||
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
    """Null and "Not judged" OEIF fall back to the ungraded outcome."""
    # (phase, OEIF, ungraded outcome, postcode, expected category)
    cases = [
        ("Primary", None, "School remains Good", "AA1 1AA", "good_primary"),
        (
            "Secondary",
            "Not judged",
            "School remains Outstanding",
            "AA1 1AB",
            "outstanding_secondary",
        ),
        # "(Concerns)"/"(Improving)" variants are still good+.
        (
            "Primary",
            None,
            "School remains Good (Concerns) - S5 Next",
            "AA1 1AC",
            "good_primary",
        ),
        (
            "Secondary",
            None,
            "School remains Outstanding (Concerns) - S5 Next",
            "AA1 1AD",
            "outstanding_secondary",
        ),
    ]
    rows = [
        _school(phase, oeif, outcome, postcode)
        for phase, oeif, outcome, postcode, _ in cases
    ]
    expected = {(postcode, category) for *_, postcode, category in cases}
    assert _classify(rows) == expected
|
||||
|
||||
|
||||
def test_ungraded_non_good_outcomes_are_excluded():
    """Ungraded outcomes that are not "School remains Good/Outstanding" (or are
    missing) never qualify a school."""
    outcomes = ["Some aspects not as strong", "Standards maintained", None]
    rows = [_school("Primary", None, outcome) for outcome in outcomes]
    assert _classify(rows) == set()
|
||||
|
||||
|
||||
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
    """A real grade 3 must not be promoted by an ungraded "remains Good"."""
    graded_three = _school("Primary", "3", "School remains Good")
    assert _classify([graded_three]) == set()
|
||||
|
||||
|
||||
def test_non_primary_secondary_phases_excluded():
    """Phases other than Primary/Secondary are dropped even with a good grade."""
    other_phases = [("Nursery", "1"), ("Not applicable", "2")]
    rows = [_school(phase, grade, None) for phase, grade in other_phases]
    assert _classify(rows) == set()
|
||||
|
|
@ -352,6 +352,176 @@ def _failures_for_active_postcode_boundary_match(spec: str) -> list[str]:
|
|||
return failures
|
||||
|
||||
|
||||
def _failures_for_postcode_features(path: Path) -> list[str]:
    """Validate the postcode feature output.

    Checks: unique Postcode, non-null lat/lon inside the England bbox,
    ctry25cd == E92000001 (a null country code is a violation too), and every
    '% ' column in [0, 100]. Mirrors the in-build invariant
    (merge._validate_postcode_feature_output) so a stale/contaminated file on
    disk cannot pass `make`.

    Returns a list of human-readable failure messages; empty means the file
    passed. Read/schema errors are reported as failures rather than raised.
    """
    failures = _failures_for_parquet(path)
    if failures:
        return failures

    try:
        names = pl.scan_parquet(path).collect_schema().names()
        required = {"Postcode", "lat", "lon", "ctry25cd"}
        missing = sorted(required - set(names))
        if missing:
            return [f"{path}: postcode features missing required columns: {missing}"]

        pct_cols = [c for c in names if c.startswith("% ")]
        # Only load the columns the checks below actually read.
        df = (
            pl.scan_parquet(path)
            .select(["Postcode", "lat", "lon", "ctry25cd", *pct_cols])
            .collect()
        )
    except Exception as exc:
        return [f"{path}: postcode features validation failed: {exc}"]

    duplicate_rows = df.height - df["Postcode"].n_unique()
    if duplicate_rows:
        failures.append(
            f"{path}: Postcode is not unique "
            f"({duplicate_rows:,} duplicate rows)"
        )

    # England bounding box (generous): lat 49.5-60N, lon -8 to 2.5E.
    bad_coords = df.filter(
        pl.col("lat").is_null()
        | pl.col("lon").is_null()
        | ~pl.col("lat").is_between(49.5, 60.0)
        | ~pl.col("lon").is_between(-8.0, 2.5)
    )
    if bad_coords.height:
        sample = bad_coords.get_column("Postcode").head(10).to_list()
        failures.append(
            f"{path}: {bad_coords.height:,} rows have null or out-of-England "
            f"lat/lon; sample: {_format_samples(sample)}"
        )

    # A plain `!=` yields null for null country codes and filter() drops null
    # predicates, so rows with a missing ctry25cd would slip through. Treat
    # null explicitly as "not England".
    bad_country = df.filter(
        pl.col("ctry25cd").is_null() | (pl.col("ctry25cd") != "E92000001")
    )
    if bad_country.height:
        sample = bad_country.get_column("Postcode").head(10).to_list()
        failures.append(
            f"{path}: {bad_country.height:,} rows have ctry25cd != 'E92000001' "
            f"(non-England contamination); sample: {_format_samples(sample)}"
        )

    # Null percentages are allowed (missing data); only present values must be
    # within range.
    for col in pct_cols:
        out_of_range = df.filter(
            pl.col(col).is_not_null() & ~pl.col(col).is_between(0.0, 100.0)
        ).height
        if out_of_range:
            failures.append(
                f"{path}: {col!r} has {out_of_range:,} values outside [0, 100]"
            )

    return failures
|
||||
|
||||
|
||||
def _failures_for_properties_subset(spec: str) -> list[str]:
    """Validate a properties parquet against the postcode feature table.

    ``spec`` is a ``PROPERTIES::POSTCODE`` pair. Two checks are made:

    * every properties Postcode exists in the postcode feature table (no
      orphan properties);
    * numeric columns whose name contains "price" or "rent" hold only positive
      values (nulls are ignored).

    Returns a list of failure messages; empty means the pair passed. Read
    errors are reported as failures rather than raised.
    """
    properties_path, postcode_path = _split_pair(spec, "properties subset")
    failures = _failures_for_parquet(properties_path) + _failures_for_parquet(
        postcode_path
    )
    if failures:
        return failures

    try:
        postcode_set = _parquet_postcodes(postcode_path)
        property_set = _parquet_postcodes(properties_path)
    except Exception as exc:
        return [f"{properties_path} / {postcode_path}: subset check failed: {exc}"]

    orphans = property_set - postcode_set
    if orphans:
        failures.append(
            f"{properties_path}: {len(orphans):,} property postcodes are absent from "
            f"{postcode_path}; sample: {_sample(orphans)}"
        )

    # Positivity check for genuine numeric price columns only (skip nested/list
    # columns like historical_prices, which contain "price" in the name).
    try:
        schema = pl.scan_parquet(properties_path).collect_schema()
        numeric = {
            pl.Int8, pl.Int16, pl.Int32, pl.Int64,
            pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
            pl.Float32, pl.Float64,
        }
        # NOTE(review): this is a substring match, so "rent" also matches any
        # numeric column whose name contains "current" (or "parent"). Confirm
        # no such column exists in the properties schema, or tighten the match.
        price_cols = [
            c
            for c, dtype in schema.items()
            if ("price" in c.lower() or "rent" in c.lower()) and dtype in numeric
        ]
        if price_cols:
            # Count non-positive values for all price columns in one scan
            # instead of re-reading the file once per column. Null comparisons
            # are null and are skipped by sum(), so nulls are not counted.
            non_positive = (
                pl.scan_parquet(properties_path)
                .select([(pl.col(c) <= 0).sum().alias(c) for c in price_cols])
                .collect()
                .row(0, named=True)
            )
            for col in price_cols:
                bad = non_positive[col]
                if bad:
                    failures.append(
                        f"{properties_path}: {col!r} has {bad:,} non-positive values"
                    )
    except Exception as exc:
        failures.append(f"{properties_path}: price positivity check failed: {exc}")

    return failures
|
||||
|
||||
|
||||
def _failures_for_price_index(path: Path) -> list[str]:
    """Check the structural integrity of price_index.parquet.

    Requires the columns sector, type_group, year, log_index and n_pairs; a
    non-null, finite log_index on every row; and unique
    (sector, type_group, year) keys.

    Rows with n_pairs == 0 are deliberately accepted: they are legitimate
    hedonic/shrinkage fallbacks for sectors with too few repeat-sale pairs.

    Returns a list of failure messages; empty means the file passed.
    """
    problems = _failures_for_parquet(path)
    if problems:
        return problems

    key_columns = ("sector", "type_group", "year")
    log_index = pl.col("log_index")

    try:
        lazy = pl.scan_parquet(path)
        present = set(lazy.collect_schema().names())
        missing = sorted({*key_columns, "log_index", "n_pairs"} - present)
        if missing:
            return [f"{path}: price index missing required columns: {missing}"]

        # One aggregate pass yields every statistic the checks below need.
        summary = pl.scan_parquet(path).select(
            pl.len().alias("n"),
            log_index.null_count().alias("null_log"),
            log_index.is_finite().not_().sum().alias("nonfinite_log"),
            pl.struct(*key_columns).n_unique().alias("unique_keys"),
        )
        stats = summary.collect().row(0, named=True)
    except Exception as exc:
        return [f"{path}: price index validation failed: {exc}"]

    null_rows = stats["null_log"]
    nonfinite_rows = stats["nonfinite_log"]
    total_rows = stats["n"]
    distinct_keys = stats["unique_keys"]

    if null_rows:
        problems.append(f"{path}: {null_rows:,} rows have null log_index")
    if nonfinite_rows:
        problems.append(f"{path}: {nonfinite_rows:,} rows have non-finite log_index")
    if distinct_keys != total_rows:
        problems.append(
            f"{path}: (sector, type_group, year) is not unique "
            f"({total_rows - distinct_keys:,} duplicate rows)"
        )

    return problems
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--file", action="append", default=[], type=Path)
|
||||
|
|
@ -385,6 +555,29 @@ def main() -> int:
|
|||
"GeoJSON postcodes: ARCGIS_PARQUET::DIR"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--postcode-features",
|
||||
action="append",
|
||||
default=[],
|
||||
type=Path,
|
||||
help=(
|
||||
"Validate a postcode feature parquet: unique Postcode, non-null "
|
||||
"lat/lon in England, ctry25cd=E92000001, '% ' columns in [0,100]"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--properties-subset",
|
||||
action="append",
|
||||
default=[],
|
||||
help="Require properties postcodes to be a subset of postcode keys: PROPERTIES::POSTCODE",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--price-index",
|
||||
action="append",
|
||||
default=[],
|
||||
type=Path,
|
||||
help="Validate price_index.parquet: finite log_index and unique (sector,type_group,year)",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
failures: list[str] = []
|
||||
|
|
@ -404,6 +597,12 @@ def main() -> int:
|
|||
failures.extend(_failures_for_postcode_boundary_match(spec))
|
||||
for spec in args.active_postcode_boundary_match:
|
||||
failures.extend(_failures_for_active_postcode_boundary_match(spec))
|
||||
for path in args.postcode_features:
|
||||
failures.extend(_failures_for_postcode_features(path))
|
||||
for spec in args.properties_subset:
|
||||
failures.extend(_failures_for_properties_subset(spec))
|
||||
for path in args.price_index:
|
||||
failures.extend(_failures_for_price_index(path))
|
||||
|
||||
if failures:
|
||||
print("Output validation failed:", file=sys.stderr)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue