Fmt
This commit is contained in:
parent
63713c3a2b
commit
bd6b511f16
17 changed files with 544 additions and 377 deletions
|
|
@ -1,4 +1,5 @@
|
|||
import argparse
|
||||
import re
|
||||
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
|
|
@ -57,9 +58,6 @@ _AREA_COLUMNS = [
|
|||
# Amenities
|
||||
"Number of restaurants within 2km",
|
||||
"Number of grocery shops and supermarkets within 2km",
|
||||
"Number of parks within 1km",
|
||||
"Distance to nearest train or tube station (km)",
|
||||
"Distance to nearest park (km)",
|
||||
# Environment
|
||||
"Noise (dB)",
|
||||
"Max available download speed (Mbps)",
|
||||
|
|
@ -85,6 +83,17 @@ _AREA_COLUMNS = [
|
|||
]
|
||||
|
||||
|
||||
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
|
||||
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
|
||||
TREE_DENSITY_FEATURE = "Street tree density percentile"
|
||||
|
||||
|
||||
def _is_dynamic_poi_metric_column(column: str) -> bool:
    """Return True if *column* is a dynamically named POI metric column.

    A column qualifies when its name matches either module-level pattern:

    * ``_DYNAMIC_POI_DISTANCE_RE`` -- names shaped like
      ``"Distance to nearest amenity (<name>) (km)"``.
    * ``_DYNAMIC_POI_COUNT_RE`` -- names shaped like
      ``"Number of amenities (<name>) within 2km"`` (or ``5km``).

    Args:
        column: Column name to test.

    Returns:
        ``True`` when either pattern matches, ``False`` otherwise.
    """
    # Both patterns are anchored with ^...$, so ``match`` checks the whole
    # name rather than just a prefix.
    return bool(
        _DYNAMIC_POI_DISTANCE_RE.match(column) or _DYNAMIC_POI_COUNT_RE.match(column)
    )
|
||||
|
||||
|
||||
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
|
||||
"""Convert an IoD deprivation score to a 0-100 less-deprived percentile."""
|
||||
non_null_count = pl.col(column).count()
|
||||
|
|
@ -117,6 +126,7 @@ def _build(
|
|||
lsoa_population_path: Path,
|
||||
median_age_path: Path,
|
||||
election_results_path: Path,
|
||||
tree_density_addresses_path: Path | None = None,
|
||||
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
||||
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
|
||||
|
||||
|
|
@ -250,6 +260,18 @@ def _build(
|
|||
school_proximity = pl.scan_parquet(school_proximity_path)
|
||||
wide = wide.join(school_proximity, on="postcode", how="left")
|
||||
|
||||
if tree_density_addresses_path is not None:
|
||||
tree_density = (
|
||||
pl.scan_parquet(tree_density_addresses_path)
|
||||
.select(
|
||||
pl.col("postcode"),
|
||||
pl.col("pp_address"),
|
||||
pl.col(TREE_DENSITY_FEATURE).cast(pl.Float32),
|
||||
)
|
||||
.unique(["postcode", "pp_address"])
|
||||
)
|
||||
wide = wide.join(tree_density, on=["postcode", "pp_address"], how="left")
|
||||
|
||||
# Broadband: derive max available download speed tier per postcode from
|
||||
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
|
||||
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as string enum.
|
||||
|
|
@ -366,9 +388,6 @@ def _build(
|
|||
"property_type": "Property type",
|
||||
"restaurants_2km": "Number of restaurants within 2km",
|
||||
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
|
||||
"parks_1km": "Number of parks within 1km",
|
||||
"train_tube_nearest_km": "Distance to nearest train or tube station (km)",
|
||||
"parks_nearest_km": "Distance to nearest park (km)",
|
||||
"latest_price": "Last known price",
|
||||
"number_habitable_rooms": "Number of bedrooms & living rooms",
|
||||
"noise_lden_db": "Noise (dB)",
|
||||
|
|
@ -398,11 +417,18 @@ def _build(
|
|||
df = wide.collect(engine="streaming")
|
||||
|
||||
# Split into postcode-level and property-level dataframes
|
||||
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
|
||||
area_cols = [
|
||||
c for c in df.columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
|
||||
]
|
||||
postcode_df = df.select(area_cols).group_by("Postcode").first()
|
||||
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
|
||||
|
||||
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
|
||||
property_cols = [
|
||||
c
|
||||
for c in df.columns
|
||||
if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
|
||||
or c == "Postcode"
|
||||
]
|
||||
properties_df = df.select(property_cols)
|
||||
print(f"Property rows: {properties_df.height}")
|
||||
|
||||
|
|
@ -481,6 +507,12 @@ def main():
|
|||
required=True,
|
||||
help="2024 General Election results by constituency parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tree-density-addresses",
|
||||
type=Path,
|
||||
required=False,
|
||||
help="Address-level tree density parquet from pipeline.transform.tree_density",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-postcodes",
|
||||
type=Path,
|
||||
|
|
@ -509,6 +541,7 @@ def main():
|
|||
lsoa_population_path=args.lsoa_population,
|
||||
median_age_path=args.median_age,
|
||||
election_results_path=args.election_results,
|
||||
tree_density_addresses_path=args.tree_density_addresses,
|
||||
)
|
||||
|
||||
print(f"\nPostcode columns: {postcode_df.columns}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue