Test changes
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 8m20s
CI / Check (push) Failing after 10m40s

This commit is contained in:
Andras Schmelczer 2026-05-09 11:35:38 +01:00
parent 4c95815dc8
commit be02fc16bb
41 changed files with 4224 additions and 759 deletions

View file

@ -7,6 +7,15 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
# NOTE(review): presumably the smallest floor area, in square metres, a record
# may have and still be kept; the use site is outside this excerpt - confirm.
MIN_FLOOR_AREA_M2 = 10
# IoD score columns that _build passes through _less_deprived_percentile_expr
# when scanning the IoD parquet, replacing each raw deprivation score with a
# 0-100 "less deprived" percentile under the same column name.
_IOD_PERCENTILE_COLUMNS = [
"Education, Skills and Training Score",
"Income Score (rate)",
"Employment Score (rate)",
"Health Deprivation and Disability Score",
"Indoors Sub-domain Score",
"Outdoors Sub-domain Score",
]
_AREA_COLUMNS = [
"Postcode",
@ -51,6 +60,14 @@ _AREA_COLUMNS = [
"Number of parks within 1km",
"Distance to nearest train or tube station (km)",
"Distance to nearest park (km)",
"Distance to nearest grocery store (km)",
"Distance to nearest tube station (km)",
"Distance to nearest rail station (km)",
"Distance to nearest Waitrose (km)",
"Distance to nearest Tesco (km)",
"Distance to nearest cafe (km)",
"Distance to nearest pub (km)",
"Distance to nearest restaurant (km)",
# Environment
"Noise (dB)",
"Max available download speed (Mbps)",
@ -76,6 +93,34 @@ _AREA_COLUMNS = [
]
def _is_dynamic_poi_metric_column(column: str) -> bool:
return (
column.startswith("Distance to nearest ")
and column.endswith(" POI (km)")
) or (
column.startswith("Number of ")
and (column.endswith(" POIs within 2km") or column.endswith(" POIs within 5km"))
)
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Convert an IoD deprivation score to a 0-100 less-deprived percentile.

    The returned expression overwrites ``column`` (it is aliased back to the
    same name). Within the frame it is evaluated on:

    * null scores stay null;
    * every row equal to the column minimum becomes ``100.0``;
    * every row equal to the column maximum becomes ``0.0``;
    * other rows get ``(descending_rank - 1) / (non_null_count - 1) * 100``
      rounded to one decimal place, so lower scores map to higher values;
    * ``100.0`` is the fallback when none of the above applies.

    Args:
        column: Name of the score column to convert.

    Returns:
        A polars expression producing the percentile under the name ``column``.
    """
    score = pl.col(column)
    non_null_count = score.count()
    # Descending rank gives rank 1 to the highest score, so the scaled value
    # below rises as the score falls. Tied scores share their average rank.
    descending_rank = score.rank("average", descending=True)
    scaled_rank = ((descending_rank - 1) / (non_null_count - 1) * 100).round(1)
    return (
        pl.when(score.is_null())
        .then(None)
        # Branches are tried in order. The extremes are pinned before the rank
        # formula so that ties at the minimum/maximum still land on exactly
        # 100/0 instead of an averaged rank. The minimum is tested first, so a
        # column with a single distinct value resolves to 100.
        .when(score == score.min())
        .then(100.0)
        .when(score == score.max())
        .then(0.0)
        # The formula divides by (non_null_count - 1); only select it when
        # that denominator is non-zero.
        .when(non_null_count > 1)
        .then(scaled_rank)
        .otherwise(100.0)
        .alias(column)
    )
def _build(
epc_pp_path: Path,
arcgis_path: Path,
@ -134,20 +179,11 @@ def _build(
)
wide = wide.join(arcgis, on="postcode", how="left")
iod = pl.scan_parquet(iod_path)
iod = pl.scan_parquet(iod_path).with_columns(
*(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
)
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
# Invert deprivation scores so that higher values = less deprived (better)
iod_score_cols = [
"Education, Skills and Training Score",
"Income Score (rate)",
"Employment Score (rate)",
"Health Deprivation and Disability Score",
"Indoors Sub-domain Score",
"Outdoors Sub-domain Score",
]
wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
ethnicity = pl.scan_parquet(ethnicity_path)
wide = wide.join(
ethnicity,
@ -351,6 +387,14 @@ def _build(
"parks_1km": "Number of parks within 1km",
"train_tube_nearest_km": "Distance to nearest train or tube station (km)",
"parks_nearest_km": "Distance to nearest park (km)",
"grocery_store_nearest_km": "Distance to nearest grocery store (km)",
"tube_station_nearest_km": "Distance to nearest tube station (km)",
"rail_station_nearest_km": "Distance to nearest rail station (km)",
"waitrose_nearest_km": "Distance to nearest Waitrose (km)",
"tesco_nearest_km": "Distance to nearest Tesco (km)",
"cafe_nearest_km": "Distance to nearest cafe (km)",
"pub_nearest_km": "Distance to nearest pub (km)",
"restaurant_nearest_km": "Distance to nearest restaurant (km)",
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
@ -381,10 +425,14 @@ def _build(
# Split into postcode-level and property-level dataframes
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
area_cols.extend(
c for c in df.columns if _is_dynamic_poi_metric_column(c) and c not in area_cols
)
area_col_set = set(area_cols)
postcode_df = df.select(area_cols).group_by("Postcode").first()
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
property_cols = [c for c in df.columns if c not in area_col_set or c == "Postcode"]
properties_df = df.select(property_cols)
print(f"Property rows: {properties_df.height}")