Fmt
This commit is contained in:
parent
63713c3a2b
commit
bd6b511f16
17 changed files with 544 additions and 377 deletions
|
|
@ -1,4 +1,5 @@
|
|||
import argparse
|
||||
import re
|
||||
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
|
|
@ -57,9 +58,6 @@ _AREA_COLUMNS = [
|
|||
# Amenities
|
||||
"Number of restaurants within 2km",
|
||||
"Number of grocery shops and supermarkets within 2km",
|
||||
"Number of parks within 1km",
|
||||
"Distance to nearest train or tube station (km)",
|
||||
"Distance to nearest park (km)",
|
||||
# Environment
|
||||
"Noise (dB)",
|
||||
"Max available download speed (Mbps)",
|
||||
|
|
@ -85,6 +83,17 @@ _AREA_COLUMNS = [
|
|||
]
|
||||
|
||||
|
||||
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
|
||||
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
|
||||
TREE_DENSITY_FEATURE = "Street tree density percentile"
|
||||
|
||||
|
||||
def _is_dynamic_poi_metric_column(column: str) -> bool:
|
||||
return bool(
|
||||
_DYNAMIC_POI_DISTANCE_RE.match(column) or _DYNAMIC_POI_COUNT_RE.match(column)
|
||||
)
|
||||
|
||||
|
||||
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
|
||||
"""Convert an IoD deprivation score to a 0-100 less-deprived percentile."""
|
||||
non_null_count = pl.col(column).count()
|
||||
|
|
@ -117,6 +126,7 @@ def _build(
|
|||
lsoa_population_path: Path,
|
||||
median_age_path: Path,
|
||||
election_results_path: Path,
|
||||
tree_density_addresses_path: Path | None = None,
|
||||
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
||||
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
|
||||
|
||||
|
|
@ -250,6 +260,18 @@ def _build(
|
|||
school_proximity = pl.scan_parquet(school_proximity_path)
|
||||
wide = wide.join(school_proximity, on="postcode", how="left")
|
||||
|
||||
if tree_density_addresses_path is not None:
|
||||
tree_density = (
|
||||
pl.scan_parquet(tree_density_addresses_path)
|
||||
.select(
|
||||
pl.col("postcode"),
|
||||
pl.col("pp_address"),
|
||||
pl.col(TREE_DENSITY_FEATURE).cast(pl.Float32),
|
||||
)
|
||||
.unique(["postcode", "pp_address"])
|
||||
)
|
||||
wide = wide.join(tree_density, on=["postcode", "pp_address"], how="left")
|
||||
|
||||
# Broadband: derive max available download speed tier per postcode from
|
||||
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
|
||||
# UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as string enum.
|
||||
|
|
@ -366,9 +388,6 @@ def _build(
|
|||
"property_type": "Property type",
|
||||
"restaurants_2km": "Number of restaurants within 2km",
|
||||
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
|
||||
"parks_1km": "Number of parks within 1km",
|
||||
"train_tube_nearest_km": "Distance to nearest train or tube station (km)",
|
||||
"parks_nearest_km": "Distance to nearest park (km)",
|
||||
"latest_price": "Last known price",
|
||||
"number_habitable_rooms": "Number of bedrooms & living rooms",
|
||||
"noise_lden_db": "Noise (dB)",
|
||||
|
|
@ -398,11 +417,18 @@ def _build(
|
|||
df = wide.collect(engine="streaming")
|
||||
|
||||
# Split into postcode-level and property-level dataframes
|
||||
area_cols = [c for c in _AREA_COLUMNS if c in df.columns]
|
||||
area_cols = [
|
||||
c for c in df.columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
|
||||
]
|
||||
postcode_df = df.select(area_cols).group_by("Postcode").first()
|
||||
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
|
||||
|
||||
property_cols = [c for c in df.columns if c not in _AREA_COLUMNS or c == "Postcode"]
|
||||
property_cols = [
|
||||
c
|
||||
for c in df.columns
|
||||
if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
|
||||
or c == "Postcode"
|
||||
]
|
||||
properties_df = df.select(property_cols)
|
||||
print(f"Property rows: {properties_df.height}")
|
||||
|
||||
|
|
@ -481,6 +507,12 @@ def main():
|
|||
required=True,
|
||||
help="2024 General Election results by constituency parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tree-density-addresses",
|
||||
type=Path,
|
||||
required=False,
|
||||
help="Address-level tree density parquet from pipeline.transform.tree_density",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-postcodes",
|
||||
type=Path,
|
||||
|
|
@ -509,6 +541,7 @@ def main():
|
|||
lsoa_population_path=args.lsoa_population,
|
||||
median_age_path=args.median_age,
|
||||
election_results_path=args.election_results,
|
||||
tree_density_addresses_path=args.tree_density_addresses,
|
||||
)
|
||||
|
||||
print(f"\nPostcode columns: {postcode_df.columns}")
|
||||
|
|
|
|||
|
|
@ -17,27 +17,6 @@ POI_GROUPS_2KM = {
|
|||
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
|
||||
}
|
||||
|
||||
# Groups for which to compute distance to nearest POI (from filtered POIs).
|
||||
# Keep `train_tube` for the existing backend feature; the individual POI
|
||||
# distance filters below power the frontend dropdown.
|
||||
DISTANCE_GROUPS = {
|
||||
"train_tube": ["Tube station", "Rail station"],
|
||||
"grocery_store": [
|
||||
"Greengrocer",
|
||||
"Supermarket",
|
||||
"Convenience Store",
|
||||
"Waitrose",
|
||||
"Tesco",
|
||||
],
|
||||
"tube_station": ["Tube station"],
|
||||
"rail_station": ["Rail station"],
|
||||
"waitrose": ["Waitrose"],
|
||||
"tesco": ["Tesco"],
|
||||
"cafe": ["Café"],
|
||||
"pub": ["Pub"],
|
||||
"restaurant": ["Restaurant"],
|
||||
}
|
||||
|
||||
# OS Open Greenspace function types used for park counts and distance calculation.
|
||||
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
|
||||
# of green spaces that are only mapped as polygons in OSM.
|
||||
|
|
@ -48,6 +27,7 @@ GREENSPACE_PARK_FUNCTIONS = {
|
|||
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
|
||||
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
|
||||
DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
|
||||
DYNAMIC_FILTER_EXCLUDED_CATEGORIES = {"Park"}
|
||||
|
||||
|
||||
def _poi_category_slug(category: str) -> str:
|
||||
|
|
@ -78,6 +58,7 @@ def _build_poi_category_groups(
|
|||
& (pl.col("len") > GROCERY_DYNAMIC_FILTER_MIN_POIS)
|
||||
)
|
||||
)
|
||||
.filter(~pl.col("category").is_in(list(DYNAMIC_FILTER_EXCLUDED_CATEGORIES)))
|
||||
.select("category")
|
||||
.sort("category")
|
||||
.to_series()
|
||||
|
|
@ -103,9 +84,11 @@ def _build_poi_category_groups(
|
|||
def _dynamic_poi_metric_renames(display_names: dict[str, str]) -> dict[str, str]:
|
||||
renames: dict[str, str] = {}
|
||||
for group_key, category in display_names.items():
|
||||
renames[f"{group_key}_nearest_km"] = f"Distance to nearest {category} POI (km)"
|
||||
renames[f"{group_key}_2km"] = f"Number of {category} POIs within 2km"
|
||||
renames[f"{group_key}_5km"] = f"Number of {category} POIs within 5km"
|
||||
renames[f"{group_key}_nearest_km"] = (
|
||||
f"Distance to nearest amenity ({category}) (km)"
|
||||
)
|
||||
renames[f"{group_key}_2km"] = f"Number of amenities ({category}) within 2km"
|
||||
renames[f"{group_key}_5km"] = f"Number of amenities ({category}) within 5km"
|
||||
return renames
|
||||
|
||||
|
||||
|
|
@ -139,12 +122,12 @@ def main():
|
|||
pois = pl.read_parquet(args.pois)
|
||||
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
|
||||
|
||||
# Count amenity POIs within 2km
|
||||
# Count static amenity groups within 2km.
|
||||
counts_2km = count_pois_per_postcode(
|
||||
postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
|
||||
)
|
||||
|
||||
# Dynamic POI filters: nearest distance plus counts within 2km and 5km for
|
||||
# Dynamic amenity filters: nearest distance plus counts within 2km and 5km for
|
||||
# the selected public transport, grocery, and leisure categories.
|
||||
dynamic_counts_2km = count_pois_per_postcode(
|
||||
postcodes, pois, groups=poi_category_groups, radius_km=2
|
||||
|
|
@ -166,25 +149,37 @@ def main():
|
|||
{k: v for k, v in dynamic_renames.items() if k in dynamic_distances.columns}
|
||||
)
|
||||
|
||||
# Distance to nearest train/tube station (from filtered POIs)
|
||||
distances = min_distance_per_postcode(postcodes, pois, groups=DISTANCE_GROUPS)
|
||||
|
||||
# Park counts and distances from OS Open Greenspace
|
||||
# Park counts and distances from OS Open Greenspace. They use the dynamic
|
||||
# amenity metric names so filters read through the same side-table path as
|
||||
# OSM-derived amenity metrics.
|
||||
greenspace = pl.read_parquet(args.greenspace)
|
||||
park_counts_1km = count_pois_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=1
|
||||
park_counts_2km = count_pois_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=2
|
||||
)
|
||||
park_counts_5km = count_pois_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS, radius_km=5
|
||||
)
|
||||
park_distances = min_distance_per_postcode(
|
||||
postcodes, greenspace, groups=GREENSPACE_PARK_FUNCTIONS
|
||||
)
|
||||
park_renames = _dynamic_poi_metric_renames({"parks": "Park"})
|
||||
park_counts_2km = park_counts_2km.rename(
|
||||
{k: v for k, v in park_renames.items() if k in park_counts_2km.columns}
|
||||
)
|
||||
park_counts_5km = park_counts_5km.rename(
|
||||
{k: v for k, v in park_renames.items() if k in park_counts_5km.columns}
|
||||
)
|
||||
park_distances = park_distances.rename(
|
||||
{k: v for k, v in park_renames.items() if k in park_distances.columns}
|
||||
)
|
||||
|
||||
# Join all results on postcode
|
||||
result = (
|
||||
counts_2km.join(distances, on="postcode")
|
||||
.join(dynamic_counts_2km, on="postcode")
|
||||
counts_2km.join(dynamic_counts_2km, on="postcode")
|
||||
.join(dynamic_counts_5km, on="postcode")
|
||||
.join(dynamic_distances, on="postcode")
|
||||
.join(park_counts_1km, on="postcode")
|
||||
.join(park_counts_2km, on="postcode")
|
||||
.join(park_counts_5km, on="postcode")
|
||||
.join(park_distances, on="postcode")
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue