Improve data pipeline
This commit is contained in:
parent
e8345cbdc1
commit
f99bd4e5c9
36 changed files with 966 additions and 129 deletions
|
|
@ -28,6 +28,17 @@ MINOR_CRIME_TYPES = (
|
|||
"Other crime",
|
||||
)
|
||||
|
||||
# Legacy police.uk crime-type names (pre-2014 taxonomy) mapped to their closest
|
||||
# current equivalent. Without this, ~1.9M incidents from 2010-2013 ("Violent
|
||||
# crime", "Public disorder and weapons") are unrecognised and silently dropped,
|
||||
# which understates pre-2013 serious crime and creates an artificial 2012->2013
|
||||
# step in the by-year series. Applied with `.replace` (not `.replace_strict`) so
|
||||
# unmapped current types pass through unchanged.
|
||||
LEGACY_CRIME_TYPE_ALIASES = {
|
||||
"Violent crime": "Violence and sexual offences",
|
||||
"Public disorder and weapons": "Public order",
|
||||
}
|
||||
|
||||
|
||||
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
|
||||
csvs = sorted(crime_dir.rglob("*.csv"))
|
||||
|
|
@ -96,6 +107,7 @@ def transform_crime(
|
|||
& pl.col("Crime type").is_not_null()
|
||||
& (pl.col("Crime type") != "")
|
||||
)
|
||||
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
|
||||
.group_by("LSOA code", "Month", "Crime type")
|
||||
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
|
||||
.group_by("LSOA code", "Crime type")
|
||||
|
|
@ -147,7 +159,10 @@ def _write_crime_by_year(
|
|||
& (pl.col("LSOA code") != "")
|
||||
& pl.col("Crime type").is_not_null()
|
||||
& (pl.col("Crime type") != "")
|
||||
).with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
|
||||
).with_columns(
|
||||
pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"),
|
||||
pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES),
|
||||
)
|
||||
|
||||
# Months observed *anywhere* in the dataset for each year (annualisation denominator).
|
||||
# Using crime-type-specific months would over-scale years where a rare type appears
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ from pathlib import Path
|
|||
import polars as pl
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.transform.crime import find_street_crime_csvs
|
||||
from pipeline.transform.crime import LEGACY_CRIME_TYPE_ALIASES, find_street_crime_csvs
|
||||
|
||||
|
||||
def _latest_months(crime_dir: Path, month_count: int) -> list[str]:
|
||||
|
|
@ -80,6 +80,10 @@ def _write_geojsonseq(csvs: list[Path], output_path: Path) -> tuple[int, int]:
|
|||
.drop_nulls(["lon", "lat"])
|
||||
.filter(pl.col("lon").is_between(-9.5, 5.0))
|
||||
.filter(pl.col("lat").is_between(49.0, 57.0))
|
||||
# Canonicalise any legacy pre-2014 type names so the heatmap's crime_type
|
||||
# values always match the frontend's canonical filter list (a no-op for
|
||||
# the recent months this overlay normally covers).
|
||||
.with_columns(pl.col("crime_type").replace(LEGACY_CRIME_TYPE_ALIASES))
|
||||
.group_by("lon", "lat", "month", "crime_type")
|
||||
.len()
|
||||
.rename({"len": "count"})
|
||||
|
|
|
|||
|
|
@ -44,6 +44,7 @@ import shapely
|
|||
from pyproj import Transformer
|
||||
|
||||
from pipeline.transform.crime import (
|
||||
LEGACY_CRIME_TYPE_ALIASES,
|
||||
MINOR_CRIME_TYPES,
|
||||
SERIOUS_CRIME_TYPES,
|
||||
find_street_crime_csvs,
|
||||
|
|
@ -150,6 +151,11 @@ def _accumulate_counts(
|
|||
& (pl.col("Crime type") != "")
|
||||
& pl.col("year").is_in(years)
|
||||
)
|
||||
# Canonicalise legacy pre-2014 crime-type names ("Violent crime",
|
||||
# "Public disorder and weapons") to their current equivalents before
|
||||
# indexing, so ~1.9M historical incidents are counted instead of
|
||||
# dropped. `.replace` leaves current types unchanged.
|
||||
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
|
||||
# Map crime types to indices with default=None so an unrecognised
|
||||
# type yields a null index we can *report* rather than silently drop
|
||||
# (the legacy LSOA path surfaced unknown types via its dynamic pivot).
|
||||
|
|
|
|||
|
|
@ -18,11 +18,49 @@ from ..utils import (
|
|||
normalize_postcode_key,
|
||||
)
|
||||
|
||||
|
||||
pl.Config.set_tbl_cols(-1)
|
||||
|
||||
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
|
||||
MIN_PRICE = 50_000
|
||||
|
||||
# Plausible construction-year range; band-derived years outside it (e.g. OCR
|
||||
# noise like 1012 or 2202) are nulled rather than published.
|
||||
MIN_BUILD_YEAR = 1700
|
||||
MAX_BUILD_YEAR = 2030
|
||||
|
||||
|
||||
def epc_band_to_year(band: pl.Expr) -> pl.Expr:
|
||||
"""Map an EPC construction age band to a single representative build year.
|
||||
|
||||
EPC age bands are ranges (e.g. ``1950-1966``); we use the band MIDPOINT
|
||||
(1958) rather than the lower bound, which previously biased every band-derived
|
||||
year ~10-15 years too young. Open-ended lower bands (``before 1900``) are too
|
||||
wide to pin to a year and return null. Single-year / ``... onwards`` bands use
|
||||
that year. Already-numeric inputs (a year produced by an earlier call) pass
|
||||
through unchanged. Years outside [MIN_BUILD_YEAR, MAX_BUILD_YEAR] are nulled.
|
||||
"""
|
||||
text = (
|
||||
band.cast(pl.Utf8)
|
||||
.str.replace("England and Wales: ", "")
|
||||
.str.replace(" onwards", "")
|
||||
)
|
||||
low = text.str.extract(r"(\d{4})", 1).cast(pl.Int32, strict=False)
|
||||
high = text.str.extract(r"(\d{4})\D+(\d{4})", 2).cast(pl.Int32, strict=False)
|
||||
year = (
|
||||
pl.when(text.str.starts_with("before "))
|
||||
.then(None)
|
||||
.when(high.is_not_null())
|
||||
.then(((low + high) / 2).round(0).cast(pl.Int32))
|
||||
.otherwise(low)
|
||||
)
|
||||
return (
|
||||
pl.when((year >= MIN_BUILD_YEAR) & (year <= MAX_BUILD_YEAR))
|
||||
.then(year)
|
||||
.otherwise(None)
|
||||
.cast(pl.UInt16, strict=False)
|
||||
)
|
||||
|
||||
|
||||
EPC_SOURCE_COLUMNS = [
|
||||
"address",
|
||||
"postcode",
|
||||
|
|
@ -410,13 +448,7 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
|
||||
# For new-builds (old_new == "Y"), use the first transaction date year as
|
||||
# the exact construction date; otherwise fall back to the EPC age band.
|
||||
epc_band_year = (
|
||||
pl.col("construction_age_band")
|
||||
.str.replace("England and Wales: ", "")
|
||||
.str.replace(" onwards", "")
|
||||
.str.extract(r"(\d{4})", 1)
|
||||
.cast(pl.UInt16, strict=False)
|
||||
)
|
||||
epc_band_year = epc_band_to_year(pl.col("construction_age_band"))
|
||||
transfer_year = (
|
||||
pl.col("first_transfer_date").dt.year().cast(pl.UInt16, strict=False)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -17,7 +17,11 @@ from shapely.strtree import STRtree
|
|||
from thefuzz import fuzz
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.transform.join_epc_pp import _scan_epc_certificates
|
||||
from pipeline.transform.join_epc_pp import _scan_epc_certificates, epc_band_to_year
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
MAX_COMPARABLE_PSM,
|
||||
MIN_COMPARABLE_PSM,
|
||||
)
|
||||
from pipeline.utils.fuzzy_join import (
|
||||
normalize_address_key,
|
||||
normalize_postcode_key,
|
||||
|
|
@ -59,7 +63,7 @@ _AREA_COLUMNS = [
|
|||
"Air Quality and Road Safety Score",
|
||||
# Ethnicity
|
||||
"% South Asian",
|
||||
"% East Asian",
|
||||
"% East/SE Asian",
|
||||
"% Black",
|
||||
"% Mixed",
|
||||
"% White",
|
||||
|
|
@ -1060,14 +1064,10 @@ def _canonical_epc_property_type_expr() -> pl.Expr:
|
|||
|
||||
|
||||
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
|
||||
return (
|
||||
pl.col(column)
|
||||
.cast(pl.Utf8)
|
||||
.str.replace("England and Wales: ", "")
|
||||
.str.replace(" onwards", "")
|
||||
.str.extract(r"(\d{4})", 1)
|
||||
.cast(pl.UInt16, strict=False)
|
||||
)
|
||||
# Use the shared band->midpoint-year mapping so the direct-EPC / listings
|
||||
# path matches join_epc_pp (band midpoint, not lower bound; 'before 1900' and
|
||||
# implausible years -> null). Already-numeric inputs pass through unchanged.
|
||||
return epc_band_to_year(pl.col(column))
|
||||
|
||||
|
||||
def _address_score(query: str, candidate: str | None) -> int:
|
||||
|
|
@ -1956,7 +1956,9 @@ def _build(
|
|||
|
||||
# Broadband: derive max available download speed tier per postcode from
|
||||
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
|
||||
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as string enum.
|
||||
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as a numeric (UInt16) Mbps value so
|
||||
# it sorts/filters correctly; null (not a fabricated 10) when no availability
|
||||
# tier is present, so "no data" is distinguishable from a genuine 10 Mbps.
|
||||
broadband = (
|
||||
pl.scan_parquet(broadband_path)
|
||||
.select(
|
||||
|
|
@ -1969,13 +1971,12 @@ def _build(
|
|||
.then(100)
|
||||
.when(pl.col("SFBB availability (% premises)") > 0)
|
||||
.then(30)
|
||||
.otherwise(10)
|
||||
.otherwise(None)
|
||||
.cast(pl.UInt16)
|
||||
.alias("max_download_speed"),
|
||||
)
|
||||
.group_by("bb_postcode")
|
||||
.agg(pl.col("max_download_speed").max())
|
||||
.with_columns(pl.col("max_download_speed").cast(pl.Utf8))
|
||||
)
|
||||
area_side_tables = {
|
||||
"iod": iod,
|
||||
|
|
@ -2052,9 +2053,20 @@ def _build(
|
|||
.otherwise(pl.col("current_energy_rating"))
|
||||
.alias("current_energy_rating"),
|
||||
).with_columns(
|
||||
(pl.col("latest_price") / pl.col("total_floor_area"))
|
||||
.round(0)
|
||||
.cast(pl.Int32)
|
||||
# Null out implausible per-sqm values (outside the kNN comparable band):
|
||||
# bulk/block transactions divided by a single unit's floor area otherwise
|
||||
# produce figures up to ~£1.5M/sqm.
|
||||
pl.when(
|
||||
(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
|
||||
& (
|
||||
(pl.col("latest_price") / pl.col("total_floor_area"))
|
||||
.is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
|
||||
)
|
||||
)
|
||||
.then(
|
||||
(pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
|
||||
)
|
||||
.otherwise(None)
|
||||
.alias("Price per sqm"),
|
||||
)
|
||||
wide = _finalize_merged_columns(wide)
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ from pathlib import Path
|
|||
|
||||
from pyproj import Transformer
|
||||
from shapely import make_valid, set_precision
|
||||
from shapely.errors import GEOSException
|
||||
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
|
||||
from shapely.ops import transform as transform_geometry
|
||||
from shapely.ops import unary_union
|
||||
|
|
@ -43,7 +44,14 @@ def _largest_polygonal(geom) -> Polygon | None:
|
|||
def to_wgs84_geojson(
|
||||
geom: Polygon | MultiPolygon, tolerance: float = 1.0
|
||||
) -> dict | None:
|
||||
"""Simplify geometry in BNG, convert to WGS84, return GeoJSON dict."""
|
||||
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
|
||||
|
||||
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
|
||||
just the intermediate Shapely object: coordinate snapping during
|
||||
serialization can otherwise leave a self-intersecting ring that only shows up
|
||||
once the feature is read back from disk. Any such geometry is repaired with
|
||||
``make_valid`` before returning so written features are always valid.
|
||||
"""
|
||||
geom = _largest_polygonal(geom)
|
||||
if geom is None:
|
||||
return None
|
||||
|
|
@ -55,12 +63,28 @@ def to_wgs84_geojson(
|
|||
|
||||
transformer = _get_to_wgs84()
|
||||
wgs84 = transform_geometry(transformer.transform, simplified)
|
||||
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
|
||||
try:
|
||||
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
|
||||
except GEOSException:
|
||||
# Precision snapping can fail on pathological geometries; fall back to a
|
||||
# plain validity repair without coordinate snapping.
|
||||
wgs84 = make_valid(wgs84)
|
||||
wgs84 = _largest_polygonal(wgs84)
|
||||
if wgs84 is None:
|
||||
return None
|
||||
|
||||
return mapping(wgs84)
|
||||
geojson_dict = mapping(wgs84)
|
||||
|
||||
# The geometry that actually reaches disk is the GeoJSON dict, so validate
|
||||
# *that* (not the pre-serialization object) and repair if needed.
|
||||
round_trip = shape(geojson_dict)
|
||||
if round_trip.is_empty or not round_trip.is_valid:
|
||||
round_trip = _largest_polygonal(make_valid(round_trip))
|
||||
if round_trip is None or round_trip.is_empty:
|
||||
return None
|
||||
geojson_dict = mapping(round_trip)
|
||||
|
||||
return geojson_dict
|
||||
|
||||
|
||||
def _fill_holes(geom):
|
||||
|
|
@ -119,7 +143,11 @@ def merge_fragments(
|
|||
pre_green = combined
|
||||
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
||||
combined = _largest_polygon(combined)
|
||||
combined = _fill_holes(combined)
|
||||
# Do NOT _fill_holes here: interior holes carved by the greenspace
|
||||
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
|
||||
# Filling them would re-add the removed area and negate the
|
||||
# subtraction. Artifact holes from the INSPIRE+Voronoi+make_valid
|
||||
# chain were already removed by the _fill_holes above (pre-subtraction).
|
||||
# Revert if subtraction + fragment selection lost >90% of area
|
||||
if pre_green.area > 0 and combined.area / pre_green.area < 0.1:
|
||||
combined = pre_green
|
||||
|
|
|
|||
|
|
@ -893,3 +893,54 @@ class TestSubtractGreenspace:
|
|||
result = subtract_greenspace(postcode, tree, geoms)
|
||||
# 80% < 90% cap, so subtraction should happen
|
||||
assert result.area == pytest.approx(2000, rel=0.01)
|
||||
|
||||
|
||||
class TestToWgs84GeojsonValidity:
|
||||
"""to_wgs84_geojson must emit GeoJSON that round-trips to a valid geometry."""
|
||||
|
||||
def test_geojson_round_trips_to_valid_geometry(self):
|
||||
from shapely.geometry import shape
|
||||
|
||||
geojson = to_wgs84_geojson(box(530000, 180000, 530100, 180100))
|
||||
assert geojson is not None
|
||||
rt = shape(geojson)
|
||||
assert not rt.is_empty
|
||||
assert rt.is_valid
|
||||
|
||||
def test_written_district_features_are_all_valid(self, tmp_path):
|
||||
from shapely.geometry import shape
|
||||
|
||||
postcodes = {
|
||||
"AA1 1AA": box(530000, 180000, 530100, 180100),
|
||||
"AA1 1AB": MultiPolygon(
|
||||
[
|
||||
box(530200, 180000, 530250, 180050),
|
||||
box(530200, 180060, 530250, 180110),
|
||||
]
|
||||
),
|
||||
}
|
||||
assert write_district_geojson(postcodes, tmp_path) == 1
|
||||
collection = json.loads((tmp_path / "units" / "AA1.geojson").read_text())
|
||||
for feature in collection["features"]:
|
||||
geom = shape(feature["geometry"])
|
||||
assert geom.is_valid
|
||||
assert not geom.is_empty
|
||||
|
||||
|
||||
class TestGreenspaceHolePreserved:
|
||||
"""Interior holes carved by greenspace subtraction must survive merge_fragments
|
||||
(the post-subtraction _fill_holes that previously negated them was removed)."""
|
||||
|
||||
def test_interior_lake_hole_survives_merge_fragments(self):
|
||||
from shapely.strtree import STRtree
|
||||
|
||||
postcode = box(0, 0, 100, 100) # 10000 sqm
|
||||
lake = box(30, 30, 70, 70) # 1600 sqm fully-interior hole (16% removal)
|
||||
result = merge_fragments(
|
||||
[("TEST1", postcode)],
|
||||
greenspace_tree=STRtree([lake]),
|
||||
greenspace_geoms=[lake],
|
||||
)
|
||||
merged = result["TEST1"]
|
||||
assert len(list(merged.interiors)) == 1
|
||||
assert merged.area == pytest.approx(10000 - 1600, rel=0.05)
|
||||
|
|
|
|||
|
|
@ -67,6 +67,16 @@ def extract_test_set(input_path: Path) -> pl.DataFrame:
|
|||
.struct.field("price")
|
||||
.alias("input_price"),
|
||||
)
|
||||
.with_columns(
|
||||
# Date of the input (second-to-last) sale, used by the kNN leakage
|
||||
# filter to exclude the target property's own prior sale from its
|
||||
# comparables. Built from year+month (day defaults to the 1st).
|
||||
pl.date(
|
||||
pl.col("input_year").cast(pl.Int32),
|
||||
pl.col("input_month").cast(pl.Int32),
|
||||
1,
|
||||
).alias("input_date"),
|
||||
)
|
||||
.with_columns(
|
||||
(
|
||||
pl.col("actual_year").cast(pl.Float64)
|
||||
|
|
|
|||
|
|
@ -18,6 +18,8 @@ import polars as pl
|
|||
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
KNN_BLEND_WEIGHT,
|
||||
MAX_COMPARABLE_PSM,
|
||||
MIN_COMPARABLE_PSM,
|
||||
build_knn_pool,
|
||||
knn_median_psm,
|
||||
)
|
||||
|
|
@ -31,7 +33,13 @@ from pipeline.transform.price_estimation.utils import (
|
|||
|
||||
MAX_KNN_TO_INDEX_RATIO = 2.0
|
||||
MIN_KNN_TO_INDEX_RATIO = 0.5
|
||||
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 6.0
|
||||
# Cap the final estimate at this multiple of the last known price as a guard
|
||||
# against data errors. Set to ~exp(MAX_LOG_ADJUSTMENT) (~20x) so it is
|
||||
# consistent with the log-index clip already applied to the index move: many
|
||||
# UK sectors legitimately grew >6x since the 1990s (e.g. parts of inner London
|
||||
# 12-14x), so the previous 6x cap truncated genuine appreciation rather than
|
||||
# only catching outliers.
|
||||
MAX_ESTIMATE_TO_LAST_PRICE_RATIO = 20.0
|
||||
|
||||
|
||||
def guarded_blend_estimates(
|
||||
|
|
@ -222,11 +230,22 @@ def main():
|
|||
).height
|
||||
print(f" kNN blended: {n_blended:,} of {n_estimated:,} estimates")
|
||||
|
||||
# Derive estimated price per sqm where both estimated price and floor area exist
|
||||
# Derive estimated price per sqm where both estimated price and floor area
|
||||
# exist. Null out values outside the plausibility band [MIN_COMPARABLE_PSM,
|
||||
# MAX_COMPARABLE_PSM] (the same band the kNN pool uses): extreme values come
|
||||
# from bulk/block transactions or floor-area errors and are not meaningful
|
||||
# per-unit prices.
|
||||
_est_psm = pl.col("Estimated current price") / pl.col("Total floor area (sqm)")
|
||||
df = df.with_columns(
|
||||
(pl.col("Estimated current price") / pl.col("Total floor area (sqm)"))
|
||||
.round(0)
|
||||
.cast(pl.Int32, strict=False)
|
||||
pl.when(
|
||||
pl.col("Estimated current price").is_not_null()
|
||||
& pl.col("Total floor area (sqm)").is_not_null()
|
||||
& (pl.col("Total floor area (sqm)") > 0)
|
||||
& (_est_psm >= MIN_COMPARABLE_PSM)
|
||||
& (_est_psm <= MAX_COMPARABLE_PSM)
|
||||
)
|
||||
.then(_est_psm.round(0).cast(pl.Int32, strict=False))
|
||||
.otherwise(None)
|
||||
.alias("Est. price per sqm"),
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ from pipeline.transform.price_estimation.shrinkage import (
|
|||
)
|
||||
from pipeline.transform.price_estimation.utils import (
|
||||
CURRENT_YEAR,
|
||||
TEMPORAL_SMOOTHNESS_LAMBDA,
|
||||
TYPE_GROUPS,
|
||||
build_hedonic_features,
|
||||
extract_centroids,
|
||||
|
|
@ -165,12 +166,50 @@ def solve_robust_index(
|
|||
cols_arr = np.concatenate([col2[mask2], col1[mask1]])
|
||||
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
|
||||
|
||||
# Temporal smoothness prior: penalise curvature in the year betas with a
|
||||
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
|
||||
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
|
||||
# zero target). This damps single-year index spikes without flattening
|
||||
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
|
||||
# year (min_year, implicit beta=0) has no column, so the penalty spans the
|
||||
# non-baseline years only. For cells with <3 betas there is no curvature to
|
||||
# penalise and the solve is unchanged.
|
||||
n_pen = 0
|
||||
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
|
||||
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
|
||||
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
|
||||
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
|
||||
cols_by_year = [c for _, c in sorted(year_to_col.items())]
|
||||
n_pen = n_cols - 2
|
||||
pen_rows = np.repeat(n + np.arange(n_pen), 3)
|
||||
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
|
||||
for k in range(n_pen):
|
||||
pen_cols[3 * k : 3 * k + 3] = (
|
||||
cols_by_year[k],
|
||||
cols_by_year[k + 1],
|
||||
cols_by_year[k + 2],
|
||||
)
|
||||
pen_rows_arr = pen_rows.astype(np.int64)
|
||||
pen_cols_arr = pen_cols
|
||||
pen_vals_arr = np.tile(
|
||||
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
|
||||
).astype(np.float64)
|
||||
pen_b = np.zeros(n_pen, dtype=np.float64)
|
||||
n_total_rows = n + n_pen
|
||||
|
||||
weights = base_weights.copy()
|
||||
|
||||
for _ in range(IRLS_ITERATIONS):
|
||||
data = signs_arr * weights[rows_arr]
|
||||
A = csc_matrix((data, (rows_arr, cols_arr)), shape=(n, n_cols))
|
||||
b = log_ratios * weights
|
||||
if n_pen:
|
||||
all_data = np.concatenate([data, pen_vals_arr])
|
||||
all_rows = np.concatenate([rows_arr, pen_rows_arr])
|
||||
all_cols = np.concatenate([cols_arr, pen_cols_arr])
|
||||
b = np.concatenate([log_ratios * weights, pen_b])
|
||||
else:
|
||||
all_data, all_rows, all_cols = data, rows_arr, cols_arr
|
||||
b = log_ratios * weights
|
||||
A = csc_matrix((all_data, (all_rows, all_cols)), shape=(n_total_rows, n_cols))
|
||||
betas = lsqr(A, b, atol=1e-10, btol=1e-10)[0]
|
||||
|
||||
# Residuals
|
||||
|
|
|
|||
|
|
@ -96,8 +96,11 @@ def spatial_smooth(
|
|||
for i, sec in enumerate(sectors_with_coords):
|
||||
n = counts.get(sec, 0)
|
||||
self_w = n / (n + SPATIAL_BLEND_K)
|
||||
if self_w > 0.95:
|
||||
continue # enough data, skip smoothing
|
||||
if self_w > 0.90:
|
||||
# Enough data, skip smoothing. Relaxed from 0.95 so higher-volume
|
||||
# cells (n ~270-570) that still carry single-year noise get a light
|
||||
# spatial blend, complementing the temporal smoothness prior.
|
||||
continue
|
||||
|
||||
dists, idxs = tree.query(scaled_coords[i], k=SPATIAL_NEIGHBORS + 1)
|
||||
# Skip self (index 0, distance ~0)
|
||||
|
|
|
|||
|
|
@ -81,8 +81,21 @@ def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
|
|||
last_prices=np.array([100_000.0, 100_000.0]),
|
||||
)
|
||||
|
||||
# Property 0: unstable kNN (>2x index) is dropped, index estimate kept.
|
||||
assert blended[0] == 120_000.0
|
||||
assert blended[1] == 600_000.0
|
||||
# Property 1: a 10x uplift over the last price is legitimate appreciation and
|
||||
# is no longer truncated (cap raised from 6x to 20x).
|
||||
assert blended[1] == 1_000_000.0
|
||||
|
||||
|
||||
def test_guarded_blend_caps_uplift_at_20x_last_price():
|
||||
# 50x index estimate over the last price is capped at the 20x ceiling.
|
||||
blended = guarded_blend_estimates(
|
||||
index_est=np.array([5_000_000.0]),
|
||||
knn_est=np.array([np.nan]),
|
||||
last_prices=np.array([100_000.0]),
|
||||
)
|
||||
assert blended[0] == 2_000_000.0 # 100_000 * 20
|
||||
|
||||
|
||||
def test_bungalow_is_not_a_dead_price_index_type_group():
|
||||
|
|
@ -92,3 +105,50 @@ def test_bungalow_is_not_a_dead_price_index_type_group():
|
|||
|
||||
assert "Bungalow" not in TYPE_GROUPS
|
||||
assert df["type_group"].to_list() == [None, None]
|
||||
|
||||
|
||||
def test_temporal_regularization_damps_curvature_without_breaking_solve():
|
||||
"""The second-difference prior reduces year-to-year curvature and keeps the
|
||||
index well-formed (all years present, finite, contiguous)."""
|
||||
from pipeline.transform.price_estimation import index as index_mod
|
||||
|
||||
years = np.arange(2010, 2021)
|
||||
true = {y: 0.04 * (y - 2010) for y in years}
|
||||
y1, y2, lr, w = [], [], [], []
|
||||
for y in years[:-1]: # adjacent-year pairs following a smooth trend
|
||||
y1.append(y)
|
||||
y2.append(y + 1)
|
||||
lr.append(true[y + 1] - true[y])
|
||||
w.append(1.0)
|
||||
# A spurious single-year jump at 2015 (poorly identified curvature spike).
|
||||
y1.append(2014)
|
||||
y2.append(2015)
|
||||
lr.append(0.5)
|
||||
w.append(1.0)
|
||||
y1, y2 = np.array(y1), np.array(y2)
|
||||
lr, w = np.array(lr, float), np.array(w, float)
|
||||
|
||||
def solve(lmbda):
|
||||
original = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
|
||||
index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = lmbda
|
||||
try:
|
||||
return index_mod.solve_robust_index(y1, y2, lr, w)
|
||||
finally:
|
||||
index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = original
|
||||
|
||||
unregularised = solve(0.0)
|
||||
regularised = solve(0.2)
|
||||
|
||||
# Index is well-formed for both.
|
||||
assert set(regularised) == set(range(2010, 2021))
|
||||
assert all(np.isfinite(v) for v in regularised.values())
|
||||
assert regularised[2010] == 0.0 # baseline year pinned to 0
|
||||
|
||||
def max_curvature(d):
|
||||
betas = np.array([d[y] for y in sorted(d)])
|
||||
return float(np.abs(np.diff(betas, 2)).max())
|
||||
|
||||
# Regularisation strictly reduces curvature, and never flattens the genuine
|
||||
# uptrend (the index still rises end to end).
|
||||
assert max_curvature(regularised) < max_curvature(unregularised)
|
||||
assert regularised[2020] > regularised[2010]
|
||||
|
|
|
|||
|
|
@ -22,6 +22,13 @@ FLAT_TYPES = ["Flats/Maisonettes"]
|
|||
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
|
||||
SHRINKAGE_K = 50
|
||||
|
||||
# Temporal regularization for the repeat-sales index: a second-difference
|
||||
# (curvature) penalty lambda * sum((beta_t - 2*beta_{t-1} + beta_{t-2})^2) added
|
||||
# to the IRLS solve. A mild penalty damps single-year index spikes (which would
|
||||
# otherwise distort the estimate of any property whose last sale landed on a
|
||||
# noisy year) without flattening genuine multi-year trends.
|
||||
TEMPORAL_SMOOTHNESS_LAMBDA = 0.05
|
||||
|
||||
|
||||
def type_group_expr():
|
||||
"""Polars expression: Property type -> type_group."""
|
||||
|
|
|
|||
|
|
@ -15,6 +15,66 @@ SCHOOL_GROUPS = {
|
|||
}
|
||||
|
||||
|
||||
def classify_good_plus_schools(ofsted: pl.DataFrame) -> pl.DataFrame:
|
||||
"""Label good+/outstanding primary & secondary schools for proximity counts.
|
||||
|
||||
Derives a grade ("1" = outstanding, "2" = good) and a proximity ``category``,
|
||||
returning a ``(postcode, category)`` frame.
|
||||
|
||||
Schools with a recent GRADED inspection carry a 1-4 grade in "Latest OEIF
|
||||
overall effectiveness" (OEIF = the previous Ofsted Education Inspection
|
||||
Framework). A large and growing share of schools were last inspected under an
|
||||
UNGRADED (Section 8) inspection or the post-2024 report-card framework, so
|
||||
that column is null/"Not judged" for them even when they are demonstrably
|
||||
good — their status lives in "Ungraded inspection overall outcome" ("School
|
||||
remains Good"/"School remains Outstanding", incl. "(Concerns)"/"(Improving)"
|
||||
variants). Filtering on the graded column alone dropped ~7,000 genuinely
|
||||
good/outstanding schools. We fall back to the ungraded outcome, but ONLY when
|
||||
there is no usable graded result (null/"Not judged"), so a genuine grade 3/4
|
||||
is never overridden.
|
||||
"""
|
||||
# Cast to Utf8 so the string predicates below are well-defined even if a
|
||||
# column happens to be entirely null (read back as a Null dtype).
|
||||
oeif = pl.col("Latest OEIF overall effectiveness").cast(pl.Utf8, strict=False)
|
||||
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
|
||||
no_usable_grade = oeif.is_null() | (oeif == "Not judged")
|
||||
graded = (
|
||||
ofsted.filter(pl.col("Ofsted phase").is_in(["Primary", "Secondary"]))
|
||||
.with_columns(
|
||||
pl.when(oeif.is_in(["1", "2"]))
|
||||
.then(oeif)
|
||||
.when(
|
||||
no_usable_grade
|
||||
& ungraded.str.starts_with("School remains Outstanding")
|
||||
)
|
||||
.then(pl.lit("1"))
|
||||
.when(no_usable_grade & ungraded.str.starts_with("School remains Good"))
|
||||
.then(pl.lit("2"))
|
||||
.otherwise(None)
|
||||
.alias("_ofsted_grade")
|
||||
)
|
||||
.filter(pl.col("_ofsted_grade").is_not_null())
|
||||
)
|
||||
# Good+ groups include both grade variants; outstanding groups count grade 1.
|
||||
return graded.with_columns(
|
||||
pl.when(pl.col("Ofsted phase") == "Primary")
|
||||
.then(
|
||||
pl.when(pl.col("_ofsted_grade") == "1")
|
||||
.then(pl.lit("outstanding_primary"))
|
||||
.otherwise(pl.lit("good_primary"))
|
||||
)
|
||||
.otherwise(
|
||||
pl.when(pl.col("_ofsted_grade") == "1")
|
||||
.then(pl.lit("outstanding_secondary"))
|
||||
.otherwise(pl.lit("good_secondary"))
|
||||
)
|
||||
.alias("category")
|
||||
).select(
|
||||
pl.col("Postcode").alias("postcode"),
|
||||
"category",
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Count good+ and outstanding primary/secondary schools near each postcode"
|
||||
|
|
@ -30,42 +90,14 @@ def main():
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load Ofsted data: filter to good+ (1, 2) primary/secondary schools.
|
||||
# Post-2025 reform the single "Overall effectiveness" grade was retired;
|
||||
# the legacy 1–4 scale is now carried forward under "Latest OEIF overall
|
||||
# effectiveness" (OEIF = the previous Ofsted Education Inspection
|
||||
# Framework). The new report-card columns use text judgements instead.
|
||||
ofsted = pl.read_parquet(args.ofsted).filter(
|
||||
pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
|
||||
& pl.col("Latest OEIF overall effectiveness").is_in(["1", "2"])
|
||||
)
|
||||
ofsted = classify_good_plus_schools(pl.read_parquet(args.ofsted))
|
||||
if ofsted.is_empty():
|
||||
raise ValueError("No good+ primary/secondary Ofsted schools found")
|
||||
|
||||
print(f"Good+ schools: {len(ofsted):,}")
|
||||
print(
|
||||
"Outstanding schools: "
|
||||
f"{ofsted.filter(pl.col('Latest OEIF overall effectiveness') == '1').height:,}"
|
||||
)
|
||||
|
||||
# Assign category based on phase and rating. Good+ groups include both
|
||||
# category variants; outstanding groups count grade 1 only.
|
||||
ofsted = ofsted.with_columns(
|
||||
pl.when(pl.col("Ofsted phase") == "Primary")
|
||||
.then(
|
||||
pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
|
||||
.then(pl.lit("outstanding_primary"))
|
||||
.otherwise(pl.lit("good_primary"))
|
||||
)
|
||||
.otherwise(
|
||||
pl.when(pl.col("Latest OEIF overall effectiveness") == "1")
|
||||
.then(pl.lit("outstanding_secondary"))
|
||||
.otherwise(pl.lit("good_secondary"))
|
||||
)
|
||||
.alias("category")
|
||||
).select(
|
||||
pl.col("Postcode").alias("postcode"),
|
||||
"category",
|
||||
f"{ofsted.filter(pl.col('category').str.starts_with('outstanding')).height:,}"
|
||||
)
|
||||
|
||||
# Join with arcgis to get lat/lng for each school's postcode
|
||||
|
|
|
|||
|
|
@ -226,3 +226,44 @@ def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
|
|||
assert burglaries["E01000050"] == [{"year": 2024, "count": 12.0}]
|
||||
assert burglaries["E01000051"] == [{"year": 2024, "count": 12.0}]
|
||||
assert burglaries["E01000099"] == [{"year": 2024, "count": 12.0}]
|
||||
|
||||
|
||||
def test_transform_crime_maps_legacy_crime_types(tmp_path):
|
||||
"""Pre-2014 police.uk type names are aliased to current equivalents instead
|
||||
of being dropped."""
|
||||
crime_dir = tmp_path / "crime"
|
||||
month_dir = crime_dir / "2013-01"
|
||||
month_dir.mkdir(parents=True)
|
||||
|
||||
header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
|
||||
(month_dir / "2013-01-test-force-street.csv").write_text(
|
||||
"\n".join(
|
||||
[
|
||||
header,
|
||||
"1,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Violent crime,Under investigation,",
|
||||
"2,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Public disorder and weapons,Under investigation,",
|
||||
"3,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Burglary,Under investigation,",
|
||||
]
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
output = tmp_path / "crime.parquet"
|
||||
by_year_output = tmp_path / "crime_by_year.parquet"
|
||||
transform_crime(crime_dir, output, by_year_output)
|
||||
|
||||
row = pl.read_parquet(output).to_dicts()[0]
|
||||
# Single month -> annualised x12. Legacy names mapped to current columns.
|
||||
assert row["Violence and sexual offences (avg/yr)"] == 12.0
|
||||
assert row["Public order (avg/yr)"] == 12.0
|
||||
assert row["Burglary (avg/yr)"] == 12.0
|
||||
# The legacy names must NOT survive as their own columns.
|
||||
assert "Violent crime (avg/yr)" not in row
|
||||
assert "Public disorder and weapons (avg/yr)" not in row
|
||||
|
||||
by_year = pl.read_parquet(by_year_output).row(0, named=True)
|
||||
serious = {p["year"]: p["count"] for p in by_year["Serious crime (by year)"]}
|
||||
# Serious = Violence and sexual offences (12) + Burglary (12) = 24
|
||||
assert serious[2013] == 24.0
|
||||
minor = {p["year"]: p["count"] for p in by_year["Minor crime (by year)"]}
|
||||
assert minor[2013] == 12.0 # Public order
|
||||
|
|
|
|||
|
|
@ -279,3 +279,37 @@ def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
|
|||
err = capsys.readouterr().err
|
||||
assert "Cyber fraud" in err
|
||||
assert "WARNING" in err
|
||||
|
||||
|
||||
def test_legacy_crime_types_are_mapped(tmp_path):
|
||||
"""Pre-2014 crime-type names are aliased to current equivalents in the
|
||||
spatial transform instead of being dropped as unknown types."""
|
||||
units = tmp_path / "units"
|
||||
_write_boundaries(
|
||||
units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
|
||||
)
|
||||
|
||||
crime = tmp_path / "crime"
|
||||
_write_month(
|
||||
crime,
|
||||
"2013-01",
|
||||
[
|
||||
_crime_row("2013-01", 1005, 1005, "Violent crime"),
|
||||
_crime_row("2013-01", 1005, 1005, "Public disorder and weapons"),
|
||||
],
|
||||
)
|
||||
|
||||
output = tmp_path / "crime_by_postcode.parquet"
|
||||
by_year = tmp_path / "crime_by_postcode_by_year.parquet"
|
||||
transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
|
||||
|
||||
row = pl.read_parquet(output).to_dicts()[0]
|
||||
# Single postcode -> area-norm factor 1.0; single month/year -> x12.
|
||||
assert row["Violence and sexual offences (avg/yr)"] == 12.0
|
||||
assert row["Public order (avg/yr)"] == 12.0
|
||||
|
||||
by_year_row = pl.read_parquet(by_year).row(0, named=True)
|
||||
assert by_year_row["Violence and sexual offences (by year)"] == [
|
||||
{"year": 2013, "count": 12.0}
|
||||
]
|
||||
assert by_year_row["Public order (by year)"] == [{"year": 2013, "count": 12.0}]
|
||||
|
|
|
|||
|
|
@ -169,7 +169,8 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
|
|||
"epc_address": "1 Example Street",
|
||||
"current_energy_rating": "C",
|
||||
"total_floor_area": 85.0,
|
||||
"construction_age_band": 1950,
|
||||
# Band midpoint of 1950-1966, not the lower bound.
|
||||
"construction_age_band": 1958,
|
||||
"was_council_house": "Yes",
|
||||
}
|
||||
]
|
||||
|
|
@ -256,3 +257,26 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
|
|||
"current_energy_rating": None,
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def test_epc_band_to_year_uses_midpoint_and_clamps():
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.join_epc_pp import epc_band_to_year
|
||||
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
"b": [
|
||||
"England and Wales: 1950-1966", # midpoint 1958
|
||||
"1900-1929", # midpoint 1914
|
||||
"England and Wales: before 1900", # too wide -> null
|
||||
"2012 onwards", # single year
|
||||
"1012", # implausible -> null
|
||||
"2202", # implausible -> null
|
||||
None, # null -> null
|
||||
"1958", # already-numeric-as-string -> pass through
|
||||
]
|
||||
}
|
||||
)
|
||||
years = df.select(epc_band_to_year(pl.col("b")).alias("y"))["y"].to_list()
|
||||
assert years == [1958, 1914, None, 2012, None, None, None, 1958]
|
||||
|
|
|
|||
82
pipeline/transform/test_school_proximity.py
Normal file
82
pipeline/transform/test_school_proximity.py
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.school_proximity import classify_good_plus_schools
|
||||
|
||||
|
||||
def _school(phase, oeif, ungraded, postcode="AA1 1AA"):
|
||||
return {
|
||||
"Postcode": postcode,
|
||||
"Ofsted phase": phase,
|
||||
"Latest OEIF overall effectiveness": oeif,
|
||||
"Ungraded inspection overall outcome": ungraded,
|
||||
}
|
||||
|
||||
|
||||
def _classify(rows):
|
||||
result = classify_good_plus_schools(pl.DataFrame(rows))
|
||||
return {(r["postcode"], r["category"]) for r in result.to_dicts()}
|
||||
|
||||
|
||||
def test_legacy_oeif_grades_1_and_2_are_kept():
|
||||
rows = [
|
||||
_school("Primary", "1", None, "AA1 1AA"),
|
||||
_school("Primary", "2", None, "AA1 1AB"),
|
||||
_school("Secondary", "1", None, "AA1 1AC"),
|
||||
_school("Secondary", "2", None, "AA1 1AD"),
|
||||
]
|
||||
assert _classify(rows) == {
|
||||
("AA1 1AA", "outstanding_primary"),
|
||||
("AA1 1AB", "good_primary"),
|
||||
("AA1 1AC", "outstanding_secondary"),
|
||||
("AA1 1AD", "good_secondary"),
|
||||
}
|
||||
|
||||
|
||||
def test_grades_3_and_4_are_excluded():
|
||||
rows = [_school("Primary", "3", None), _school("Primary", "4", None)]
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def test_ungraded_remains_good_is_recovered_when_no_graded_result():
|
||||
# Null and "Not judged" OEIF fall back to the ungraded outcome.
|
||||
rows = [
|
||||
_school("Primary", None, "School remains Good", "AA1 1AA"),
|
||||
_school("Secondary", "Not judged", "School remains Outstanding", "AA1 1AB"),
|
||||
# "(Concerns)"/"(Improving)" variants are still good+.
|
||||
_school("Primary", None, "School remains Good (Concerns) - S5 Next", "AA1 1AC"),
|
||||
_school(
|
||||
"Secondary",
|
||||
None,
|
||||
"School remains Outstanding (Concerns) - S5 Next",
|
||||
"AA1 1AD",
|
||||
),
|
||||
]
|
||||
assert _classify(rows) == {
|
||||
("AA1 1AA", "good_primary"),
|
||||
("AA1 1AB", "outstanding_secondary"),
|
||||
("AA1 1AC", "good_primary"),
|
||||
("AA1 1AD", "outstanding_secondary"),
|
||||
}
|
||||
|
||||
|
||||
def test_ungraded_non_good_outcomes_are_excluded():
|
||||
rows = [
|
||||
_school("Primary", None, "Some aspects not as strong"),
|
||||
_school("Primary", None, "Standards maintained"),
|
||||
_school("Primary", None, None),
|
||||
]
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def test_genuine_grade_3_is_not_overridden_by_stale_remains_good():
|
||||
# A real grade 3 must not be promoted by an ungraded "remains Good".
|
||||
rows = [_school("Primary", "3", "School remains Good")]
|
||||
assert _classify(rows) == set()
|
||||
|
||||
|
||||
def test_non_primary_secondary_phases_excluded():
|
||||
rows = [
|
||||
_school("Nursery", "1", None),
|
||||
_school("Not applicable", "2", None),
|
||||
]
|
||||
assert _classify(rows) == set()
|
||||
Loading…
Add table
Add a link
Reference in a new issue