Update data

This commit is contained in:
Andras Schmelczer 2026-05-14 08:17:10 +01:00
parent a4103b0896
commit 273d7a83ee
15 changed files with 716 additions and 316 deletions

View file

@ -22,6 +22,8 @@ _AREA_COLUMNS = [
"Postcode",
"lat",
"lon",
# Runtime provenance for deciding whether missing coordinates are skippable.
"ctry25cd",
# Deprivation
"Income Score",
"Employment Score",
@ -86,6 +88,15 @@ _AREA_COLUMNS = [
# Column-name pattern: "Distance to nearest amenity (<anything>) (km)".
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
# Column-name pattern: "Number of amenities (<anything>) within 2km" or "... within 5km".
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
# Canonical output column name for tree density; _tree_density_by_postcode
# aliases whichever source column it selects to this name.
TREE_DENSITY_FEATURE = "Street tree density percentile"
# Fallback source-column pattern for postcode-level tree density files that do
# not carry TREE_DENSITY_FEATURE: "Tree canopy density percentile within <N>m".
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
r"^Tree canopy density percentile within \d+m$"
)
# LAD code -> LAD name. _validate_lad_source_coverage uses the keys as the only
# LADs allowed to be absent from the rental-price source.
_RENT_SOURCE_UNAVAILABLE_LADS = {
# ONS PIPR does not publish LAD-level private-rent estimates for these
# small authorities. Keep rent null there, but fail on any other LAD miss.
"E06000053": "Isles of Scilly",
"E09000001": "City of London",
}
def _is_dynamic_poi_metric_column(column: str) -> bool:
@ -112,6 +123,107 @@ def _less_deprived_percentile_expr(column: str) -> pl.Expr:
)
def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame:
    """Lazily load postcode-level tree density as (postcode, TREE_DENSITY_FEATURE).

    The density column is taken from the column named TREE_DENSITY_FEATURE when
    the file has one; otherwise from the single column whose name matches
    _POSTCODE_TREE_DENSITY_PERCENTILE_RE. The chosen column is cast to Float32
    and exposed under the TREE_DENSITY_FEATURE name.

    Args:
        tree_density_postcodes_path: Parquet file with a "postcode" column and
            a tree density percentile column.

    Returns:
        A LazyFrame with null postcodes dropped and one row per postcode.

    Raises:
        ValueError: If "postcode" is absent, or if there is no
            TREE_DENSITY_FEATURE column and the number of regex-matching
            columns is not exactly one.
    """
    frame = pl.scan_parquet(tree_density_postcodes_path)
    available = set(frame.collect_schema().names())

    if "postcode" not in available:
        raise ValueError(
            f"{tree_density_postcodes_path} is missing required column: postcode"
        )

    # Prefer the canonical column; only fall back to pattern matching when the
    # file does not already use the canonical name.
    source_column = TREE_DENSITY_FEATURE
    if source_column not in available:
        matches = [
            name
            for name in available
            if _POSTCODE_TREE_DENSITY_PERCENTILE_RE.match(name)
        ]
        matches.sort()
        if len(matches) != 1:
            raise ValueError(
                f'{tree_density_postcodes_path} must contain column "{TREE_DENSITY_FEATURE}" '
                'or exactly one "Tree canopy density percentile within {radius}m" column; '
                f"found {len(matches)} postcode percentile columns"
            )
        source_column = matches[0]

    density = pl.col(source_column).cast(pl.Float32).alias(TREE_DENSITY_FEATURE)
    normalised = frame.select(pl.col("postcode"), density)
    return normalised.drop_nulls(["postcode"]).unique(["postcode"])
def _validate_lad_source_coverage(
    iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
) -> None:
    """Check that the ethnicity and rental sources cover every 2024 LAD.

    The reference set of LADs is the distinct
    "Local Authority District code (2024)" values in the parquet at iod_path.
    Each reference LAD must appear in the ethnicity file ("Geography_code")
    and in the rental file ("area_code"), except that LADs listed in
    _RENT_SOURCE_UNAVAILABLE_LADS may be absent from the rental file.

    Args:
        iod_path: Parquet providing the reference LAD codes and names.
        ethnicity_path: Parquet with a "Geography_code" column.
        rental_prices_path: Parquet with an "area_code" column.

    Raises:
        ValueError: If any reference LAD is missing from the ethnicity data,
            or any LAD outside _RENT_SOURCE_UNAVAILABLE_LADS is missing from
            the rental data.
    """
    code_column = "Local Authority District code (2024)"
    name_column = "Local Authority District name (2024)"
    reference = (
        pl.read_parquet(iod_path, columns=[code_column, name_column])
        .rename({code_column: "lad", name_column: "lad_name"})
        .unique(["lad"])
    )

    # Ethnicity: every reference LAD must be present, no exceptions.
    ethnicity_codes = pl.read_parquet(
        ethnicity_path, columns=["Geography_code"]
    ).rename({"Geography_code": "lad"})
    ethnicity_gaps = reference.join(ethnicity_codes, on="lad", how="anti").sort("lad")
    if ethnicity_gaps.height:
        raise ValueError(
            "Ethnicity data is missing 2024 LAD coverage: "
            f"{ethnicity_gaps.to_dicts()}"
        )

    # Rent: gaps are tolerated only for the known source-unavailable LADs.
    rent_codes = pl.read_parquet(rental_prices_path, columns=["area_code"]).rename(
        {"area_code": "lad"}
    )
    rent_gaps = reference.join(rent_codes, on="lad", how="anti").sort("lad")
    tolerated_codes = list(_RENT_SOURCE_UNAVAILABLE_LADS)
    unexplained_gaps = rent_gaps.filter(~pl.col("lad").is_in(tolerated_codes))
    if unexplained_gaps.height:
        raise ValueError(
            "Rental data is missing 2024 LAD coverage: "
            f"{unexplained_gaps.to_dicts()}"
        )

    # Anything still missing here is on the tolerated list; report it so the
    # resulting null rents are not a surprise.
    if rent_gaps.height:
        print(
            "PIPR has no LAD-level rent estimates for source-unavailable LADs; "
            f"rent will remain null there: {rent_gaps.to_dicts()}"
        )
def _validate_property_postcodes(df: pl.DataFrame) -> None:
    """Fail if any row has a null or blank "Postcode".

    A postcode counts as blank when it is the empty string after being cast
    to Utf8 and stripped.

    Args:
        df: Frame with a "Postcode" column.

    Raises:
        ValueError: If at least one row has a null or blank postcode. The
            message carries the offending row count and up to 10 sample rows,
            limited to whichever of "Postcode",
            "Address per Property Register" and "Last known price" exist.
    """
    postcode = pl.col("Postcode")
    is_blank = postcode.cast(pl.Utf8).str.strip_chars() == ""
    offenders = df.filter(postcode.is_null() | is_blank)
    if not offenders.height:
        return

    # Only show identifying columns that actually exist in this frame.
    preferred = ("Postcode", "Address per Property Register", "Last known price")
    present = offenders.columns
    preview_columns = [name for name in preferred if name in present]
    preview = offenders.select(preview_columns).head(10).to_dicts()
    raise ValueError(
        "Property rows missing a postcode after merge: "
        f"{offenders.height} rows. Sample: {preview}"
    )
def _build(
epc_pp_path: Path,
arcgis_path: Path,
@ -126,12 +238,14 @@ def _build(
lsoa_population_path: Path,
median_age_path: Path,
election_results_path: Path,
tree_density_addresses_path: Path | None = None,
tree_density_postcodes_path: Path | None = None,
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
Returns (postcode_df, properties_df).
"""
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
wide = pl.scan_parquet(epc_pp_path).filter(
pl.col("total_floor_area").is_null()
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
@ -152,9 +266,15 @@ def _build(
.drop("new_postcode")
)
arcgis_raw = pl.scan_parquet(arcgis_path)
postcode_country = arcgis_raw.select(
pl.col("pcds").alias("postcode"),
pl.col("ctry25cd"),
).unique(["postcode"])
wide = wide.join(postcode_country, on="postcode", how="left")
arcgis = (
pl.scan_parquet(arcgis_path)
.filter(pl.col("ctry25cd") == "E92000001") # England only
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001") # England only
.filter(pl.col("doterm").is_null()) # Active postcodes only
# NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
# Alias them back to the short canonical names used across the
@ -191,7 +311,9 @@ def _build(
.cast(pl.UInt8)
.alias("_bedrooms"),
)
rental = pl.scan_parquet(rental_prices_path)
rental = pl.scan_parquet(rental_prices_path).select(
"area_code", "bedrooms", "mean_monthly_rent"
)
wide = wide.join(
rental,
left_on=["Local Authority District code (2024)", "_bedrooms"],
@ -260,17 +382,9 @@ def _build(
school_proximity = pl.scan_parquet(school_proximity_path)
wide = wide.join(school_proximity, on="postcode", how="left")
if tree_density_addresses_path is not None:
tree_density = (
pl.scan_parquet(tree_density_addresses_path)
.select(
pl.col("postcode"),
pl.col("pp_address"),
pl.col(TREE_DENSITY_FEATURE).cast(pl.Float32),
)
.unique(["postcode", "pp_address"])
)
wide = wide.join(tree_density, on=["postcode", "pp_address"], how="left")
if tree_density_postcodes_path is not None:
tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
wide = wide.join(tree_density, on="postcode", how="left")
# Broadband: derive max available download speed tier per postcode from
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
@ -415,6 +529,7 @@ def _build(
print("Collecting with streaming engine...")
df = wide.collect(engine="streaming")
_validate_property_postcodes(df)
# Split into postcode-level and property-level dataframes
area_cols = [
@ -508,10 +623,10 @@ def main():
help="2024 General Election results by constituency parquet file",
)
parser.add_argument(
"--tree-density-addresses",
"--tree-density-postcodes",
type=Path,
required=False,
help="Address-level tree density parquet from pipeline.transform.tree_density",
help="Postcode-level tree density parquet from pipeline.transform.tree_density",
)
parser.add_argument(
"--output-postcodes",
@ -541,7 +656,7 @@ def main():
lsoa_population_path=args.lsoa_population,
median_age_path=args.median_age,
election_results_path=args.election_results,
tree_density_addresses_path=args.tree_density_addresses,
tree_density_postcodes_path=args.tree_density_postcodes,
)
print(f"\nPostcode columns: {postcode_df.columns}")