Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@ -17,7 +17,11 @@ from shapely.strtree import STRtree
from thefuzz import fuzz
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.join_epc_pp import _scan_epc_certificates
from pipeline.transform.join_epc_pp import _scan_epc_certificates, epc_band_to_year
from pipeline.transform.price_estimation.knn import (
MAX_COMPARABLE_PSM,
MIN_COMPARABLE_PSM,
)
from pipeline.utils.fuzzy_join import (
normalize_address_key,
normalize_postcode_key,
@ -59,7 +63,7 @@ _AREA_COLUMNS = [
"Air Quality and Road Safety Score",
# Ethnicity
"% South Asian",
"% East Asian",
"% East/SE Asian",
"% Black",
"% Mixed",
"% White",
@ -1060,14 +1064,10 @@ def _canonical_epc_property_type_expr() -> pl.Expr:
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
return (
pl.col(column)
.cast(pl.Utf8)
.str.replace("England and Wales: ", "")
.str.replace(" onwards", "")
.str.extract(r"(\d{4})", 1)
.cast(pl.UInt16, strict=False)
)
# Use the shared band->midpoint-year mapping so the direct-EPC / listings
# path matches join_epc_pp (band midpoint, not lower bound; 'before 1900' and
# implausible years -> null). Already-numeric inputs pass through unchanged.
return epc_band_to_year(pl.col(column))
def _address_score(query: str, candidate: str | None) -> int:
@ -1956,7 +1956,9 @@ def _build(
# Broadband: derive max available download speed tier per postcode from
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as string enum.
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as a numeric (UInt16) Mbps value so
# it sorts/filters correctly; null (not a fabricated 10) when no availability
# tier is present, so "no data" is distinguishable from a genuine 10 Mbps.
broadband = (
pl.scan_parquet(broadband_path)
.select(
@ -1969,13 +1971,12 @@ def _build(
.then(100)
.when(pl.col("SFBB availability (% premises)") > 0)
.then(30)
.otherwise(10)
.otherwise(None)
.cast(pl.UInt16)
.alias("max_download_speed"),
)
.group_by("bb_postcode")
.agg(pl.col("max_download_speed").max())
.with_columns(pl.col("max_download_speed").cast(pl.Utf8))
)
area_side_tables = {
"iod": iod,
@ -2052,9 +2053,20 @@ def _build(
.otherwise(pl.col("current_energy_rating"))
.alias("current_energy_rating"),
).with_columns(
(pl.col("latest_price") / pl.col("total_floor_area"))
.round(0)
.cast(pl.Int32)
# Null out implausible per-sqm values (outside the kNN comparable band):
# bulk/block transactions divided by a single unit's floor area otherwise
# produce figures up to ~£1.5M/sqm.
pl.when(
(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
& (
(pl.col("latest_price") / pl.col("total_floor_area"))
.is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
)
)
.then(
(pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
)
.otherwise(None)
.alias("Price per sqm"),
)
wide = _finalize_merged_columns(wide)