Test changes
This commit is contained in:
parent
4c95815dc8
commit
be02fc16bb
41 changed files with 4224 additions and 759 deletions
|
|
@ -7,6 +7,15 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
|
|||
|
||||
MIN_FLOOR_AREA_M2 = 10
|
||||
|
||||
_IOD_PERCENTILE_COLUMNS = [
|
||||
"Education, Skills and Training Score",
|
||||
"Income Score (rate)",
|
||||
"Employment Score (rate)",
|
||||
"Health Deprivation and Disability Score",
|
||||
"Indoors Sub-domain Score",
|
||||
"Outdoors Sub-domain Score",
|
||||
]
|
||||
|
||||
|
||||
_AREA_COLUMNS = [
|
||||
"Postcode",
|
||||
|
|
@ -51,6 +60,14 @@ _AREA_COLUMNS = [
|
|||
"Number of parks within 1km",
|
||||
"Distance to nearest train or tube station (km)",
|
||||
"Distance to nearest park (km)",
|
||||
"Distance to nearest grocery store (km)",
|
||||
"Distance to nearest tube station (km)",
|
||||
"Distance to nearest rail station (km)",
|
||||
"Distance to nearest Waitrose (km)",
|
||||
"Distance to nearest Tesco (km)",
|
||||
"Distance to nearest cafe (km)",
|
||||
"Distance to nearest pub (km)",
|
||||
"Distance to nearest restaurant (km)",
|
||||
# Environment
|
||||
"Noise (dB)",
|
||||
"Max available download speed (Mbps)",
|
||||
|
|
@ -76,6 +93,34 @@ _AREA_COLUMNS = [
|
|||
]
|
||||
|
||||
|
||||
def _is_dynamic_poi_metric_column(column: str) -> bool:
|
||||
return (
|
||||
column.startswith("Distance to nearest ")
|
||||
and column.endswith(" POI (km)")
|
||||
) or (
|
||||
column.startswith("Number of ")
|
||||
and (column.endswith(" POIs within 2km") or column.endswith(" POIs within 5km"))
|
||||
)
|
||||
|
||||
|
||||
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Convert an IoD deprivation score to a 0-100 less-deprived percentile.

    The score is ranked in descending order (the highest score gets rank 1)
    and the rank is rescaled linearly onto 0-100, rounded to one decimal
    place, so that the lowest score maps to 100 and the highest to 0.

    Branches are evaluated in this order:

    1. Null scores stay null.
    2. Rows equal to the column minimum are 100.0.
    3. Rows equal to the column maximum are 0.0.
    4. With more than one non-null value, the rescaled descending rank.
    5. Otherwise 100.0.

    Because the minimum check precedes the maximum check, a column whose
    non-null values are all identical (including a single non-null value)
    yields 100.0 for every non-null row.

    Args:
        column: Name of the score column to convert.

    Returns:
        A Polars expression aliased back to ``column``, so using it in
        ``with_columns`` replaces the original score.
    """
    score = pl.col(column)
    # count() is bound to `non_null_count` in the original; it is used as
    # the number of ranked values when rescaling.
    non_null_count = score.count()
    # "average" ranking gives tied scores the mean of the ranks they span.
    descending_rank = score.rank("average", descending=True)
    return (
        pl.when(score.is_null())
        .then(None)
        # The extremes are pinned explicitly: with ties at either end the
        # averaged rank would not land exactly on 100 or 0.
        .when(score == score.min())
        .then(100.0)
        .when(score == score.max())
        .then(0.0)
        # Guard the (non_null_count - 1) divisor against a zero denominator.
        .when(non_null_count > 1)
        .then(((descending_rank - 1) / (non_null_count - 1) * 100).round(1))
        # NOTE(review): appears to be a defensive fallback; a non-null row
        # with non_null_count <= 1 should already have matched the minimum
        # branch above - confirm whether this is reachable (e.g. NaN).
        .otherwise(100.0)
        .alias(column)
    )
|
||||
|
||||
|
||||
def _build(
|
||||
epc_pp_path: Path,
|
||||
arcgis_path: Path,
|
||||
|
|
@ -134,20 +179,11 @@ def _build(
|
|||
)
|
||||
wide = wide.join(arcgis, on="postcode", how="left")
|
||||
|
||||
iod = pl.scan_parquet(iod_path)
|
||||
iod = pl.scan_parquet(iod_path).with_columns(
|
||||
*(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
|
||||
)
|
||||
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
|
||||
|
||||
# Invert deprivation scores so that higher values = less deprived (better)
|
||||
iod_score_cols = [
|
||||
"Education, Skills and Training Score",
|
||||
"Income Score (rate)",
|
||||
"Employment Score (rate)",
|
||||
"Health Deprivation and Disability Score",
|
||||
"Indoors Sub-domain Score",
|
||||
"Outdoors Sub-domain Score",
|
||||
]
|
||||
wide = wide.with_columns(*(pl.col(c).max() - pl.col(c) for c in iod_score_cols))
|
||||
|
||||
ethnicity = pl.scan_parquet(ethnicity_path)
|
||||
wide = wide.join(
|
||||
ethnicity,
|
||||
|
|
@ -351,6 +387,14 @@ def _build(
|
|||
"parks_1km": "Number of parks within 1km",
|
||||
"train_tube_nearest_km": "Distance to nearest train or tube station (km)",
|
||||
"parks_nearest_km": "Distance to nearest park (km)",
|
||||
"grocery_store_nearest_km": "Distance to nearest grocery store (km)",
|
||||
"tube_station_nearest_km": "Distance to nearest tube station (km)",
|
||||
"rail_station_nearest_km": "Distance to nearest rail station (km)",
|
||||
"waitrose_nearest_km": "Distance to nearest Waitrose (km)",
|
||||
"tesco_nearest_km": "Distance to nearest Tesco (km)",
|
||||
"cafe_nearest_km": "Distance to nearest cafe (km)",
|
||||
"pub_nearest_km": "Distance to nearest pub (km)",
|
||||
"restaurant_nearest_km": "Distance to nearest restaurant (km)",
|
||||
"latest_price": "Last known price",
|
||||
"number_habitable_rooms": "Number of bedrooms & living rooms",
|
||||
"noise_lden_db": "Noise (dB)",
|
||||
|
|
@ -381,10 +425,14 @@ def _build(
|
|||
|
||||
# Split into postcode-level and property-level dataframes
|
||||
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
|
||||
area_cols.extend(
|
||||
c for c in df.columns if _is_dynamic_poi_metric_column(c) and c not in area_cols
|
||||
)
|
||||
area_col_set = set(area_cols)
|
||||
postcode_df = df.select(area_cols).group_by("Postcode").first()
|
||||
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
|
||||
|
||||
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
|
||||
property_cols = [c for c in df.columns if c not in area_col_set or c == "Postcode"]
|
||||
properties_df = df.select(property_cols)
|
||||
print(f"Property rows: {properties_df.height}")
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue