Update data
This commit is contained in:
parent
a4103b0896
commit
273d7a83ee
15 changed files with 716 additions and 316 deletions
|
|
@ -22,6 +22,8 @@ _AREA_COLUMNS = [
|
|||
"Postcode",
|
||||
"lat",
|
||||
"lon",
|
||||
# Runtime provenance for deciding whether missing coordinates are skippable.
|
||||
"ctry25cd",
|
||||
# Deprivation
|
||||
"Income Score",
|
||||
"Employment Score",
|
||||
|
|
@ -86,6 +88,15 @@ _AREA_COLUMNS = [
|
|||
# Column-name patterns for per-amenity POI metrics. The amenity label sits in
# the first parenthesised group, so the set of matching columns is open-ended.
# Matches e.g. "Distance to nearest amenity (<label>) (km)".
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
# Matches e.g. "Number of amenities (<label>) within 2km" (only 2km or 5km).
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
# Canonical output column name for tree density; _tree_density_by_postcode
# aliases whichever source column it picks to this name.
TREE_DENSITY_FEATURE = "Street tree density percentile"
# Fallback source column accepted by _tree_density_by_postcode when the
# canonical column is absent: a canopy percentile at any whole-metre radius.
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
    r"^Tree canopy density percentile within \d+m$"
)
# LAD code -> name for authorities that are allowed to be absent from the
# rental source; _validate_lad_source_coverage only reads the keys.
_RENT_SOURCE_UNAVAILABLE_LADS = {
    # ONS PIPR does not publish LAD-level private-rent estimates for these
    # small authorities. Keep rent null there, but fail on any other LAD miss.
    "E06000053": "Isles of Scilly",
    "E09000001": "City of London",
}
|
||||
|
||||
|
||||
def _is_dynamic_poi_metric_column(column: str) -> bool:
|
||||
|
|
@ -112,6 +123,107 @@ def _less_deprived_percentile_expr(column: str) -> pl.Expr:
|
|||
)
|
||||
|
||||
|
||||
def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame:
    """Lazily load postcode-level tree density under the canonical column name.

    The parquet file must have a ``postcode`` column. The density value is
    taken from ``TREE_DENSITY_FEATURE`` when that column exists; otherwise the
    file must contain exactly one column matching
    ``_POSTCODE_TREE_DENSITY_PERCENTILE_RE``, which is used instead.

    Returns:
        A LazyFrame with ``postcode`` and ``TREE_DENSITY_FEATURE`` (cast to
        Float32), with null postcodes dropped and one row kept per postcode.

    Raises:
        ValueError: if ``postcode`` is missing, or if the density column
            cannot be resolved unambiguously.
    """
    frame = pl.scan_parquet(tree_density_postcodes_path)
    # Only the schema is resolved here; the data itself stays lazy.
    available = frozenset(frame.collect_schema().names())

    if "postcode" not in available:
        raise ValueError(
            f"{tree_density_postcodes_path} is missing required column: postcode"
        )

    source_column = TREE_DENSITY_FEATURE
    if source_column not in available:
        # Fall back to a radius-specific percentile column, but only when the
        # choice is unambiguous.
        matches = [
            name
            for name in sorted(available)
            if _POSTCODE_TREE_DENSITY_PERCENTILE_RE.match(name)
        ]
        if len(matches) != 1:
            raise ValueError(
                f'{tree_density_postcodes_path} must contain column "{TREE_DENSITY_FEATURE}" '
                'or exactly one "Tree canopy density percentile within {radius}m" column; '
                f"found {len(matches)} postcode percentile columns"
            )
        (source_column,) = matches

    density = pl.col(source_column).cast(pl.Float32).alias(TREE_DENSITY_FEATURE)
    projected = frame.select(pl.col("postcode"), density)
    return projected.drop_nulls(["postcode"]).unique(["postcode"])
|
||||
|
||||
|
||||
def _validate_lad_source_coverage(
    iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
) -> None:
    """Check that LAD-keyed sources cover every 2024 LAD listed in the IoD file.

    The distinct LAD codes in ``iod_path`` are the reference set. Every one of
    them must appear in the ethnicity file (``Geography_code``) and in the
    rental file (``area_code``), except that LADs listed in
    ``_RENT_SOURCE_UNAVAILABLE_LADS`` may be absent from the rental file; those
    are reported with ``print`` instead of failing.

    Raises:
        ValueError: if the ethnicity data misses any LAD, or the rental data
            misses a LAD that is not in ``_RENT_SOURCE_UNAVAILABLE_LADS``.
    """
    code_column = "Local Authority District code (2024)"
    name_column = "Local Authority District name (2024)"

    # Reference set: one row per LAD code, with its name kept for the messages.
    expected = pl.read_parquet(iod_path, columns=[code_column, name_column])
    expected = expected.rename({code_column: "lad", name_column: "lad_name"})
    expected = expected.unique(["lad"])

    ethnicity_codes = pl.read_parquet(ethnicity_path, columns=["Geography_code"])
    ethnicity_codes = ethnicity_codes.rename({"Geography_code": "lad"})
    # Anti-join keeps the reference LADs that have no match in the source.
    ethnicity_gaps = expected.join(ethnicity_codes, on="lad", how="anti").sort("lad")
    if ethnicity_gaps.height > 0:
        raise ValueError(
            "Ethnicity data is missing 2024 LAD coverage: "
            f"{ethnicity_gaps.to_dicts()}"
        )

    rental_codes = pl.read_parquet(rental_prices_path, columns=["area_code"])
    rental_codes = rental_codes.rename({"area_code": "lad"})
    rent_gaps = expected.join(rental_codes, on="lad", how="anti").sort("lad")

    tolerated = list(_RENT_SOURCE_UNAVAILABLE_LADS)
    unexplained = rent_gaps.filter(~pl.col("lad").is_in(tolerated))
    if unexplained.height > 0:
        raise ValueError(
            "Rental data is missing 2024 LAD coverage: "
            f"{unexplained.to_dicts()}"
        )

    # Any gaps that remain at this point passed the allow-list check above.
    if rent_gaps.height > 0:
        print(
            "PIPR has no LAD-level rent estimates for source-unavailable LADs; "
            f"rent will remain null there: {rent_gaps.to_dicts()}"
        )
|
||||
|
||||
|
||||
def _validate_property_postcodes(df: pl.DataFrame) -> None:
    """Fail if any row of ``df`` has a null or whitespace-only ``Postcode``.

    Raises:
        ValueError: reporting the number of offending rows and a sample of up
            to ten of them, limited to the identifying columns that are
            present in ``df``.
    """
    postcode = pl.col("Postcode")
    is_blank = postcode.cast(pl.Utf8).str.strip_chars() == ""
    bad_rows = df.filter(postcode.is_null() | is_blank)

    if bad_rows.height > 0:
        # Keep the sample small and restrict it to columns that exist.
        wanted = ("Postcode", "Address per Property Register", "Last known price")
        present = [name for name in wanted if name in bad_rows.columns]
        preview = bad_rows.select(present).head(10).to_dicts()
        raise ValueError(
            "Property rows missing a postcode after merge: "
            f"{bad_rows.height} rows. Sample: {preview}"
        )
|
||||
|
||||
|
||||
def _build(
|
||||
epc_pp_path: Path,
|
||||
arcgis_path: Path,
|
||||
|
|
@ -126,12 +238,14 @@ def _build(
|
|||
lsoa_population_path: Path,
|
||||
median_age_path: Path,
|
||||
election_results_path: Path,
|
||||
tree_density_addresses_path: Path | None = None,
|
||||
tree_density_postcodes_path: Path | None = None,
|
||||
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
||||
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
|
||||
|
||||
Returns (postcode_df, properties_df).
|
||||
"""
|
||||
_validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)
|
||||
|
||||
wide = pl.scan_parquet(epc_pp_path).filter(
|
||||
pl.col("total_floor_area").is_null()
|
||||
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
|
||||
|
|
@ -152,9 +266,15 @@ def _build(
|
|||
.drop("new_postcode")
|
||||
)
|
||||
|
||||
arcgis_raw = pl.scan_parquet(arcgis_path)
|
||||
postcode_country = arcgis_raw.select(
|
||||
pl.col("pcds").alias("postcode"),
|
||||
pl.col("ctry25cd"),
|
||||
).unique(["postcode"])
|
||||
wide = wide.join(postcode_country, on="postcode", how="left")
|
||||
|
||||
arcgis = (
|
||||
pl.scan_parquet(arcgis_path)
|
||||
.filter(pl.col("ctry25cd") == "E92000001") # England only
|
||||
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001") # England only
|
||||
.filter(pl.col("doterm").is_null()) # Active postcodes only
|
||||
# NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
|
||||
# Alias them back to the short canonical names used across the
|
||||
|
|
@ -191,7 +311,9 @@ def _build(
|
|||
.cast(pl.UInt8)
|
||||
.alias("_bedrooms"),
|
||||
)
|
||||
rental = pl.scan_parquet(rental_prices_path)
|
||||
rental = pl.scan_parquet(rental_prices_path).select(
|
||||
"area_code", "bedrooms", "mean_monthly_rent"
|
||||
)
|
||||
wide = wide.join(
|
||||
rental,
|
||||
left_on=["Local Authority District code (2024)", "_bedrooms"],
|
||||
|
|
@ -260,17 +382,9 @@ def _build(
|
|||
school_proximity = pl.scan_parquet(school_proximity_path)
|
||||
wide = wide.join(school_proximity, on="postcode", how="left")
|
||||
|
||||
if tree_density_addresses_path is not None:
|
||||
tree_density = (
|
||||
pl.scan_parquet(tree_density_addresses_path)
|
||||
.select(
|
||||
pl.col("postcode"),
|
||||
pl.col("pp_address"),
|
||||
pl.col(TREE_DENSITY_FEATURE).cast(pl.Float32),
|
||||
)
|
||||
.unique(["postcode", "pp_address"])
|
||||
)
|
||||
wide = wide.join(tree_density, on=["postcode", "pp_address"], how="left")
|
||||
if tree_density_postcodes_path is not None:
|
||||
tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
|
||||
wide = wide.join(tree_density, on="postcode", how="left")
|
||||
|
||||
# Broadband: derive max available download speed tier per postcode from
|
||||
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
|
||||
|
|
@ -415,6 +529,7 @@ def _build(
|
|||
|
||||
print("Collecting with streaming engine...")
|
||||
df = wide.collect(engine="streaming")
|
||||
_validate_property_postcodes(df)
|
||||
|
||||
# Split into postcode-level and property-level dataframes
|
||||
area_cols = [
|
||||
|
|
@ -508,10 +623,10 @@ def main():
|
|||
help="2024 General Election results by constituency parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tree-density-addresses",
|
||||
"--tree-density-postcodes",
|
||||
type=Path,
|
||||
required=False,
|
||||
help="Address-level tree density parquet from pipeline.transform.tree_density",
|
||||
help="Postcode-level tree density parquet from pipeline.transform.tree_density",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-postcodes",
|
||||
|
|
@ -541,7 +656,7 @@ def main():
|
|||
lsoa_population_path=args.lsoa_population,
|
||||
median_age_path=args.median_age,
|
||||
election_results_path=args.election_results,
|
||||
tree_density_addresses_path=args.tree_density_addresses,
|
||||
tree_density_postcodes_path=args.tree_density_postcodes,
|
||||
)
|
||||
|
||||
print(f"\nPostcode columns: {postcode_df.columns}")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue