This commit is contained in:
Andras Schmelczer 2026-05-13 08:00:12 +01:00
parent 63713c3a2b
commit bd6b511f16
17 changed files with 544 additions and 377 deletions

View file

@ -1,4 +1,5 @@
import argparse
import re
import polars as pl
from pathlib import Path
@ -57,9 +58,6 @@ _AREA_COLUMNS = [
# Amenities
"Number of restaurants within 2km",
"Number of grocery shops and supermarkets within 2km",
"Number of parks within 1km",
"Distance to nearest train or tube station (km)",
"Distance to nearest park (km)",
# Environment
"Noise (dB)",
"Max available download speed (Mbps)",
@ -85,6 +83,17 @@ _AREA_COLUMNS = [
]
# Column-name patterns for POI metrics whose names embed a free-text amenity
# label in parentheses, so they cannot be listed statically:
#   "Distance to nearest amenity (<label>) (km)"
#   "Number of amenities (<label>) within 2km" / "... within 5km"
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")

# Name of the street tree density column.
TREE_DENSITY_FEATURE = "Street tree density percentile"


def _is_dynamic_poi_metric_column(column: str) -> bool:
    """Return True when *column* is a dynamically named POI metric column.

    A column qualifies when its entire name has the form of either the
    nearest-amenity distance pattern or the amenity count pattern (2km or
    5km radius only); the parenthesised label must be non-empty.

    Args:
        column: Column name to classify.

    Returns:
        True if the whole name matches one of the two patterns, else False.
    """
    # fullmatch rather than match: with match(), the trailing "$" also accepts
    # a name that ends in a newline, which is not a valid metric column name.
    return any(
        pattern.fullmatch(column) is not None
        for pattern in (_DYNAMIC_POI_DISTANCE_RE, _DYNAMIC_POI_COUNT_RE)
    )
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
"""Convert an IoD deprivation score to a 0-100 less-deprived percentile."""
non_null_count = pl.col(column).count()
@ -117,6 +126,7 @@ def _build(
lsoa_population_path: Path,
median_age_path: Path,
election_results_path: Path,
tree_density_addresses_path: Path | None = None,
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
@ -250,6 +260,18 @@ def _build(
school_proximity = pl.scan_parquet(school_proximity_path)
wide = wide.join(school_proximity, on="postcode", how="left")
if tree_density_addresses_path is not None:
tree_density = (
pl.scan_parquet(tree_density_addresses_path)
.select(
pl.col("postcode"),
pl.col("pp_address"),
pl.col(TREE_DENSITY_FEATURE).cast(pl.Float32),
)
.unique(["postcode", "pp_address"])
)
wide = wide.join(tree_density, on=["postcode", "pp_address"], how="left")
# Broadband: derive max available download speed tier per postcode from
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as string enum.
@ -366,9 +388,6 @@ def _build(
"property_type": "Property type",
"restaurants_2km": "Number of restaurants within 2km",
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
"parks_1km": "Number of parks within 1km",
"train_tube_nearest_km": "Distance to nearest train or tube station (km)",
"parks_nearest_km": "Distance to nearest park (km)",
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
@ -398,11 +417,18 @@ def _build(
df = wide.collect(engine="streaming")
# Split into postcode-level and property-level dataframes
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
area_cols = [
c for c in df.columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
]
postcode_df = df.select(area_cols).group_by("Postcode").first()
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
property_cols = [
c
for c in df.columns
if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
or c == "Postcode"
]
properties_df = df.select(property_cols)
print(f"Property rows: {properties_df.height}")
@ -481,6 +507,12 @@ def main():
required=True,
help="2024 General Election results by constituency parquet file",
)
parser.add_argument(
"--tree-density-addresses",
type=Path,
required=False,
help="Address-level tree density parquet from pipeline.transform.tree_density",
)
parser.add_argument(
"--output-postcodes",
type=Path,
@ -509,6 +541,7 @@ def main():
lsoa_population_path=args.lsoa_population,
median_age_path=args.median_age,
election_results_path=args.election_results,
tree_density_addresses_path=args.tree_density_addresses,
)
print(f"\nPostcode columns: {postcode_df.columns}")