Improve data
This commit is contained in:
parent
b4d66a28c1
commit
85da1941aa
31 changed files with 901 additions and 319 deletions
|
|
@ -2,6 +2,7 @@ import argparse
|
|||
import re
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
from datetime import date
|
||||
from typing import Literal
|
||||
|
||||
import numpy as np
|
||||
|
|
@ -30,7 +31,10 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
|
|||
|
||||
# Floor-area threshold in square metres (per the constant's name).
MIN_FLOOR_AREA_M2 = 10

# Output feature names.
CONSERVATION_AREA_FEATURE = "Within conservation area"
# Named "Tree canopy" (not "Street tree") because the underlying density unions
# Forest Research TOW lone-tree/group crowns AND NFI woodland canopy, so a
# woodland-edge postcode's score reflects forest canopy, not only street trees.
# The earlier "Street tree density percentile" name was dead: it was assigned
# and then immediately overwritten, so only this value was ever observable.
TREE_DENSITY_FEATURE = "Tree canopy density percentile"
LISTED_BUILDING_FEATURE = "Listed building"

# Listed-building matching parameters: radius in metres and the number of
# nearest postcodes considered (per the constants' names).
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
LISTED_BUILDING_NEAREST_POSTCODES = 3
|
||||
|
|
@ -528,10 +532,22 @@ def _is_planning_conservation_area_record(dataset: object) -> bool:
|
|||
|
||||
|
||||
def _is_current_planning_record(end_date: object) -> bool:
|
||||
"""A planning record is current when it has no end-date OR its end-date is
|
||||
still in the future. The planning.data.gov.uk `end-date` field marks when a
|
||||
designation is RETIRED, so a future date (e.g. 2029-12-31) is a still-current
|
||||
area and must NOT be dropped — the previous "any non-empty date = ended"
|
||||
logic wrongly excluded those (e.g. 22 current Gateshead conservation areas)."""
|
||||
if end_date is None:
|
||||
return True
|
||||
if isinstance(end_date, str):
|
||||
return end_date.strip() == ""
|
||||
text = end_date.strip()
|
||||
if text == "":
|
||||
return True
|
||||
try:
|
||||
return date.fromisoformat(text[:10]) > date.today()
|
||||
except ValueError:
|
||||
# Unparseable end-date: keep the record rather than silently drop it.
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
|
@ -706,8 +722,32 @@ def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame
|
|||
)
|
||||
|
||||
|
||||
def _validate_lsoa_source_coverage(iod_path: Path, ethnicity_path: Path) -> None:
    """Require the ethnicity table to cover every LSOA listed in the IoD table.

    Ethnicity is sourced from Census 2021 TS021 at LSOA and joined on `lsoa21`,
    like median age and IoD. The IoD table defines the LSOA universe every
    postcode resolves into, so an LSOA absent from the ethnicity table would
    silently null the ethnicity columns for its postcodes. This check turns
    that silent gap into a hard failure.

    Args:
        iod_path: Parquet file with a "LSOA code (2021)" column.
        ethnicity_path: Parquet file with an `lsoa21` column.

    Raises:
        ValueError: If any IoD LSOA has no row in the ethnicity table. The
            message reports the count and up to ten example codes.
    """
    lsoa_key = "lsoa21"
    iod_column = "LSOA code (2021)"

    expected = pl.read_parquet(iod_path, columns=[iod_column]).rename(
        {iod_column: lsoa_key}
    )
    present = pl.read_parquet(ethnicity_path, columns=[lsoa_key])

    # Anti-join keeps only the IoD LSOAs with no ethnicity match; sorting makes
    # the reported examples deterministic.
    uncovered = expected.join(present, on=lsoa_key, how="anti").sort(lsoa_key)
    if uncovered.height == 0:
        return

    examples = uncovered.head(10).to_dicts()
    raise ValueError(
        "Ethnicity data is missing LSOA coverage: "
        f"{uncovered.height} LSOAs, e.g. "
        f"{examples}"
    )
|
||||
|
||||
|
||||
def _validate_lad_source_coverage(
|
||||
iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
|
||||
iod_path: Path, rental_prices_path: Path
|
||||
) -> None:
|
||||
iod_lads = (
|
||||
pl.read_parquet(
|
||||
|
|
@ -726,16 +766,6 @@ def _validate_lad_source_coverage(
|
|||
.unique(["lad"])
|
||||
)
|
||||
|
||||
ethnicity_lads = pl.read_parquet(ethnicity_path, columns=["Geography_code"]).rename(
|
||||
{"Geography_code": "lad"}
|
||||
)
|
||||
missing_ethnicity = iod_lads.join(ethnicity_lads, on="lad", how="anti").sort("lad")
|
||||
if missing_ethnicity.height > 0:
|
||||
raise ValueError(
|
||||
"Ethnicity data is missing 2024 LAD coverage: "
|
||||
f"{missing_ethnicity.to_dicts()}"
|
||||
)
|
||||
|
||||
rental_lads = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename(
|
||||
{"area_code": "lad"}
|
||||
)
|
||||
|
|
@ -849,12 +879,10 @@ def _join_area_side_tables(
|
|||
broadband: pl.LazyFrame,
|
||||
) -> pl.LazyFrame:
|
||||
base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
|
||||
base = base.join(
|
||||
ethnicity,
|
||||
left_on="Local Authority District code (2024)",
|
||||
right_on="Geography_code",
|
||||
how="left",
|
||||
)
|
||||
# Ethnicity is Census 2021 TS021 at LSOA (~33,755 areas), joined on the same
|
||||
# `lsoa21` key as median age and IoD — a ~100x granularity gain over the old
|
||||
# Local-Authority broadcast, with no change to the 6-bucket output schema.
|
||||
base = base.join(ethnicity, on="lsoa21", how="left")
|
||||
|
||||
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||
# postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
|
||||
|
|
@ -1966,7 +1994,8 @@ def _build(
|
|||
"""
|
||||
if mode == "listings" and actual_listings_path is None:
|
||||
raise ValueError("listings mode requires actual_listings_path")
|
||||
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
|
||||
_validate_lsoa_source_coverage(iod_path, ethnicity_path)
|
||||
_validate_lad_source_coverage(iod_path, rental_prices_path)
|
||||
|
||||
wide = pl.scan_parquet(epc_pp_path).filter(
|
||||
pl.col("total_floor_area").is_null()
|
||||
|
|
@ -2225,7 +2254,7 @@ def main():
|
|||
"--ethnicity",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Ethnicity by local authority parquet file (optional)",
|
||||
help="Census 2021 ethnic group (TS021) by LSOA parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--crime",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue