This commit is contained in:
Andras Schmelczer 2026-05-31 20:20:41 +01:00
parent 8688b7475e
commit e8345cbdc1
40 changed files with 1980 additions and 904 deletions

View file

@ -116,6 +116,66 @@ TREE_DENSITY_FEATURE = "Street tree density percentile"
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
r"^Tree canopy density percentile within \d+m$"
)
_FINAL_DROP_COLUMNS = [
"inspection_date",
"_bedrooms",
"LSOA name (2021)",
"Local Authority District code (2024)",
"Local Authority District name (2024)",
"Wider Barriers Sub-domain Score",
"Geographical Barriers Sub-domain Score",
"Adult Skills Sub-domain Score",
"Children and Young People Sub-domain Score",
"Crime Score",
"Living Environment Score",
"Index of Multiple Deprivation (IMD) Score",
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
"Income Deprivation Affecting Children Index (IDACI) Score (rate)",
"Barriers to Housing and Services Score",
"oa21",
"pcon",
"epc_property_type",
"pp_property_type",
"built_form",
]
_FINAL_RENAME_COLUMNS = {
"date_of_transfer": "Date of last transaction",
"construction_age_band": "Construction year",
"is_construction_date_approximate": "Is construction date approximate",
"Income Score (rate)": "Income Score",
"Employment Score (rate)": "Employment Score",
"Indoors Sub-domain Score": "Housing Conditions Score",
"Outdoors Sub-domain Score": "Air Quality and Road Safety Score",
"pp_address": "Address per Property Register",
"epc_address": "Address per EPC",
"postcode": "Postcode",
"duration": "Leasehold/Freehold",
"current_energy_rating": "Current energy rating",
"potential_energy_rating": "Potential energy rating",
"total_floor_area": "Total floor area (sqm)",
"property_type": "Property type",
"restaurants_2km": "Number of restaurants within 2km",
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
"good_primary_5km": "Good+ primary schools within 5km",
"good_secondary_5km": "Good+ secondary schools within 5km",
"good_primary_2km": "Good+ primary schools within 2km",
"good_secondary_2km": "Good+ secondary schools within 2km",
"outstanding_primary_5km": "Outstanding primary schools within 5km",
"outstanding_secondary_5km": "Outstanding secondary schools within 5km",
"outstanding_primary_2km": "Outstanding primary schools within 2km",
"outstanding_secondary_2km": "Outstanding secondary schools within 2km",
"max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)",
"mean_monthly_rent": "Estimated monthly rent",
"floor_height": "Interior height (m)",
"was_council_house": "Former council house",
"median_age": "Median age",
"turnout_pct": "Voter turnout (%)",
}
_RENT_SOURCE_UNAVAILABLE_LADS = {
# ONS PIPR does not publish LAD-level private-rent estimates for these
# small authorities. Keep rent null there, but fail on any other LAD miss.
@ -707,6 +767,181 @@ def _validate_property_postcodes(df: pl.DataFrame) -> None:
)
def _active_english_postcode_area(arcgis_raw: pl.LazyFrame) -> pl.LazyFrame:
"""Return the supported postcode universe with geography join keys."""
return (
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")
.filter(pl.col("doterm").is_null())
.select(
pl.col("pcds").alias("postcode"),
"lat",
pl.col("long").alias("lon"),
"ctry25cd",
pl.col("lsoa21cd").alias("lsoa21"),
pl.col("oa21cd").alias("oa21"),
pl.col("pcon24cd").alias("pcon"),
)
.drop_nulls(["postcode"])
.unique(["postcode"])
)
def _remap_terminated_postcodes(
wide: pl.LazyFrame, postcode_mapping: pl.LazyFrame
) -> pl.LazyFrame:
return (
wide.join(
postcode_mapping,
left_on="postcode",
right_on="old_postcode",
how="left",
)
.with_columns(
pl.coalesce("new_postcode", "postcode").alias("postcode"),
)
.drop("new_postcode")
)
def _filter_to_active_english_postcodes(
wide: pl.LazyFrame, active_postcodes: pl.LazyFrame
) -> pl.LazyFrame:
return wide.join(active_postcodes, on="postcode", how="semi")
def _join_area_side_tables(
base: pl.LazyFrame,
*,
iod: pl.LazyFrame,
ethnicity: pl.LazyFrame,
crime: pl.LazyFrame,
median_age: pl.LazyFrame,
election: pl.LazyFrame,
poi_counts: pl.LazyFrame,
noise: pl.LazyFrame,
school_proximity: pl.LazyFrame,
conservation_areas: pl.LazyFrame,
tree_density: pl.LazyFrame | None,
broadband: pl.LazyFrame,
) -> pl.LazyFrame:
base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
base = base.join(
ethnicity,
left_on="Local Authority District code (2024)",
right_on="Geography_code",
how="left",
)
# Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA.
base = base.join(crime, on="postcode", how="left")
base = base.with_columns(
pl.sum_horizontal(
"Violence and sexual offences (avg/yr)",
"Robbery (avg/yr)",
"Burglary (avg/yr)",
"Possession of weapons (avg/yr)",
).alias("serious_crime_avg_yr"),
pl.sum_horizontal(
"Anti-social behaviour (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other theft (avg/yr)",
"Vehicle crime (avg/yr)",
"Public order (avg/yr)",
"Drugs (avg/yr)",
"Other crime (avg/yr)",
).alias("minor_crime_avg_yr"),
)
base = base.join(median_age, on="lsoa21", how="left")
base = base.join(election, on="pcon", how="left")
base = base.join(poi_counts, on="postcode", how="left")
base = base.join(noise, on="postcode", how="left")
base = base.join(school_proximity, on="postcode", how="left")
base = base.join(conservation_areas, on="postcode", how="left").with_columns(
pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
)
if tree_density is not None:
base = base.join(tree_density, on="postcode", how="left")
return base.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
def _finalize_merged_columns(frame: pl.LazyFrame) -> pl.LazyFrame:
return frame.drop(_FINAL_DROP_COLUMNS, strict=False).rename(
_FINAL_RENAME_COLUMNS, strict=False
)
def _area_columns_from(columns: list[str]) -> list[str]:
return [
c for c in columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
]
def _property_columns_from(columns: list[str]) -> list[str]:
return [
c
for c in columns
if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
or c == "Postcode"
]
def _validate_postcode_feature_output(
postcode_df: pl.DataFrame, expected_postcode_count: int
) -> None:
required = {"Postcode", "lat", "lon", "ctry25cd"}
missing = sorted(required - set(postcode_df.columns))
if missing:
raise ValueError(f"Postcode feature output missing columns: {missing}")
unique_count = postcode_df["Postcode"].n_unique()
if (
postcode_df.height != expected_postcode_count
or unique_count != expected_postcode_count
):
raise ValueError(
"Postcode feature output no longer matches the active England "
"postcode universe: "
f"rows={postcode_df.height:,}, unique={unique_count:,}, "
f"expected={expected_postcode_count:,}"
)
invalid = postcode_df.filter(
pl.col("Postcode").is_null()
| (pl.col("Postcode").cast(pl.Utf8).str.strip_chars() == "")
| pl.col("lat").is_null()
| pl.col("lon").is_null()
| pl.col("ctry25cd").is_null()
| (pl.col("ctry25cd") != "E92000001")
)
if invalid.height > 0:
sample = (
invalid.select("Postcode", "ctry25cd", "lat", "lon").head(10).to_dicts()
)
raise ValueError(
"Postcode feature output contains unsupported or ungeocoded rows: "
f"{invalid.height} rows. Sample: {sample}"
)
def _split_normal_outputs(
df: pl.DataFrame,
postcode_features: pl.DataFrame,
*,
expected_postcode_count: int,
) -> tuple[pl.DataFrame, pl.DataFrame]:
postcode_df = postcode_features.select(
_area_columns_from(postcode_features.columns)
)
_validate_postcode_feature_output(postcode_df, expected_postcode_count)
properties_df = df.select(_property_columns_from(df.columns))
return postcode_df, properties_df
# Map listings-parquet source columns to the `_actual_*` overlay columns
# carried alongside the wide frame through the postcode-keyed joins. After the
# rest of the pipeline finalises, listing rows pick their canonical dashboard
@ -927,9 +1162,7 @@ def _best_listing_match(
return best, float(best_score), "address", best_field
def _load_listings_for_merge(
listings_path: Path, arcgis_path: Path
) -> pl.DataFrame:
def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
"""Read the listings parquet and prepare it for the wide-frame merge.
Output is keyed by `_listing_idx` and carries:
@ -1032,7 +1265,11 @@ def _load_direct_epc_candidates(
"_direct_epc_outcode": pl.Utf8,
"_direct_epc_canonical_property_type": pl.Utf8,
"_direct_epc_uprn": pl.Utf8,
**{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
**{
column: dtype
for column, dtype in _DIRECT_EPC_COLUMNS
if column.startswith("_direct_")
},
}
if not listing_outcodes:
return pl.DataFrame(schema=schema)
@ -1089,9 +1326,7 @@ def _load_direct_epc_candidates(
pl.col("epc_address").alias("_direct_epc_address"),
pl.col("uprn").alias("_direct_epc_uprn"),
pl.col("total_floor_area").alias("_direct_total_floor_area"),
pl.col("number_habitable_rooms").alias(
"_direct_number_habitable_rooms"
),
pl.col("number_habitable_rooms").alias("_direct_number_habitable_rooms"),
pl.col("floor_height").alias("_direct_floor_height"),
pl.col("_direct_was_council_house").fill_null("No"),
)
@ -1141,9 +1376,7 @@ def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
)
def _optional_lazy_col(
schema: pl.Schema, column: str, dtype: pl.DataType
) -> pl.Expr:
def _optional_lazy_col(schema: pl.Schema, column: str, dtype: pl.DataType) -> pl.Expr:
if column in schema:
return pl.col(column).cast(dtype, strict=False).alias(column)
return pl.lit(None, dtype=dtype).alias(column)
@ -1640,27 +1873,18 @@ def _build(
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
)
# Remap terminated postcodes to nearest active successor
# Remap terminated postcodes to nearest active successor before filtering to
# the supported active-English postcode universe. Historical properties from
# terminated English postcodes are retained under their successor postcode.
postcode_mapping = build_postcode_mapping(arcgis_path)
wide = (
wide.join(
postcode_mapping.lazy(),
left_on="postcode",
right_on="old_postcode",
how="left",
)
.with_columns(
pl.coalesce("new_postcode", "postcode").alias("postcode"),
)
.drop("new_postcode")
)
wide = _remap_terminated_postcodes(wide, postcode_mapping.lazy())
arcgis_raw = pl.scan_parquet(arcgis_path)
postcode_country = arcgis_raw.select(
pl.col("pcds").alias("postcode"),
pl.col("ctry25cd"),
).unique(["postcode"])
wide = wide.join(postcode_country, on="postcode", how="left")
arcgis = _active_english_postcode_area(arcgis_raw)
active_postcodes = arcgis.select("postcode").unique()
active_postcode_count = (
active_postcodes.select(pl.len()).collect(engine="streaming").item()
)
wide = _filter_to_active_english_postcodes(wide, active_postcodes)
if listed_buildings_path is not None:
active_postcodes_for_listed = (
@ -1691,92 +1915,25 @@ def _build(
arcgis_path,
epc_path=actual_listings_epc_path,
)
wide = _filter_to_active_english_postcodes(wide, active_postcodes)
wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))
arcgis = (
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001") # England only
.filter(pl.col("doterm").is_null()) # Active postcodes only
# NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
# Alias them back to the short canonical names used across the
# pipeline so downstream joins don't need to know about NSPL's
# versioning scheme.
.select(
pl.col("pcds").alias("postcode"),
"lat",
pl.col("long").alias("lon"),
pl.col("lsoa21cd").alias("lsoa21"),
pl.col("oa21cd").alias("oa21"),
pl.col("pcon24cd").alias("pcon"),
)
)
# NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
# `_active_english_postcode_area` aliases them back to the short canonical
# names used across the pipeline so downstream joins don't need to know
# about NSPL's versioning scheme.
wide = wide.join(arcgis, on="postcode", how="left")
postcode_area = arcgis
iod = pl.scan_parquet(iod_path).with_columns(
*(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
)
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
ethnicity = pl.scan_parquet(ethnicity_path)
wide = wide.join(
ethnicity,
left_on="Local Authority District code (2024)",
right_on="Geography_code",
how="left",
)
# Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
wide = wide.with_columns(
(pl.col("number_habitable_rooms") - 1)
.clip(0, 4)
.cast(pl.UInt8)
.alias("_bedrooms"),
)
rental = pl.scan_parquet(rental_prices_path).select(
"area_code", "bedrooms", "mean_monthly_rent"
)
wide = wide.join(
rental,
left_on=["Local Authority District code (2024)", "_bedrooms"],
right_on=["area_code", "bedrooms"],
how="left",
)
# Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA.
crime = pl.scan_parquet(crime_path)
wide = wide.join(crime, on="postcode", how="left")
wide = wide.with_columns(
pl.sum_horizontal(
"Violence and sexual offences (avg/yr)",
"Robbery (avg/yr)",
"Burglary (avg/yr)",
"Possession of weapons (avg/yr)",
).alias("serious_crime_avg_yr"),
pl.sum_horizontal(
"Anti-social behaviour (avg/yr)",
"Criminal damage and arson (avg/yr)",
"Shoplifting (avg/yr)",
"Bicycle theft (avg/yr)",
"Theft from the person (avg/yr)",
"Other theft (avg/yr)",
"Vehicle crime (avg/yr)",
"Public order (avg/yr)",
"Drugs (avg/yr)",
"Other crime (avg/yr)",
).alias("minor_crime_avg_yr"),
)
median_age = pl.scan_parquet(median_age_path)
wide = wide.join(median_age, on="lsoa21", how="left")
election = pl.scan_parquet(election_results_path)
wide = wide.join(election, on="pcon", how="left")
poi_counts = pl.scan_parquet(poi_proximity_path)
wide = wide.join(poi_counts, on="postcode", how="left")
noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
noise = (
pl.scan_parquet(noise_path)
@ -1789,21 +1946,13 @@ def _build(
)
.select("postcode", "noise_lden_db")
)
wide = wide.join(noise, on="postcode", how="left")
school_proximity = pl.scan_parquet(school_proximity_path)
wide = wide.join(school_proximity, on="postcode", how="left")
conservation_areas = _conservation_area_by_postcode(
arcgis.select("postcode", "lat", "lon"), conservation_areas_path
)
wide = wide.join(conservation_areas, on="postcode", how="left").with_columns(
pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
)
tree_density = None
if tree_density_postcodes_path is not None:
tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
wide = wide.join(tree_density, on="postcode", how="left")
# Broadband: derive max available download speed tier per postcode from
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
@ -1828,7 +1977,38 @@ def _build(
.agg(pl.col("max_download_speed").max())
.with_columns(pl.col("max_download_speed").cast(pl.Utf8))
)
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
area_side_tables = {
"iod": iod,
"ethnicity": ethnicity,
"crime": crime,
"median_age": median_age,
"election": election,
"poi_counts": poi_counts,
"noise": noise,
"school_proximity": school_proximity,
"conservation_areas": conservation_areas,
"tree_density": tree_density,
"broadband": broadband,
}
wide = _join_area_side_tables(wide, **area_side_tables)
postcode_area = _join_area_side_tables(postcode_area, **area_side_tables)
# Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
wide = wide.with_columns(
(pl.col("number_habitable_rooms") - 1)
.clip(0, 4)
.cast(pl.UInt8)
.alias("_bedrooms"),
)
rental = pl.scan_parquet(rental_prices_path).select(
"area_code", "bedrooms", "mean_monthly_rent"
)
wide = wide.join(
rental,
left_on=["Local Authority District code (2024)", "_bedrooms"],
right_on=["area_code", "bedrooms"],
how="left",
)
# Derive property_type: prefer EPC data, fall back to price-paid.
# For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
@ -1862,112 +2042,40 @@ def _build(
.alias("property_type")
)
wide = (
wide.with_columns(
pl.when(pl.col("duration") == "U")
.then(None)
.otherwise(pl.col("duration"))
.alias("duration"),
pl.when(pl.col("current_energy_rating") == "INVALID!")
.then(None)
.otherwise(pl.col("current_energy_rating"))
.alias("current_energy_rating"),
)
.with_columns(
(pl.col("latest_price") / pl.col("total_floor_area"))
.round(0)
.cast(pl.Int32)
.alias("Price per sqm"),
)
.drop(
"inspection_date",
"_bedrooms",
"LSOA name (2021)",
"Local Authority District code (2024)",
"Local Authority District name (2024)",
"Wider Barriers Sub-domain Score",
"Geographical Barriers Sub-domain Score",
"Adult Skills Sub-domain Score",
"Children and Young People Sub-domain Score",
"Crime Score",
"Living Environment Score",
"Index of Multiple Deprivation (IMD) Score",
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
"Income Deprivation Affecting Children Index (IDACI) Score (rate)",
"Barriers to Housing and Services Score",
"oa21",
"pcon",
"epc_property_type",
"pp_property_type",
"built_form",
)
.rename(
{
"date_of_transfer": "Date of last transaction",
"construction_age_band": "Construction year",
"is_construction_date_approximate": "Is construction date approximate",
"Income Score (rate)": "Income Score",
"Employment Score (rate)": "Employment Score",
"Indoors Sub-domain Score": "Housing Conditions Score",
"Outdoors Sub-domain Score": "Air Quality and Road Safety Score",
"pp_address": "Address per Property Register",
"epc_address": "Address per EPC",
"postcode": "Postcode",
"duration": "Leasehold/Freehold",
"current_energy_rating": "Current energy rating",
"potential_energy_rating": "Potential energy rating",
"total_floor_area": "Total floor area (sqm)",
"property_type": "Property type",
"restaurants_2km": "Number of restaurants within 2km",
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
"latest_price": "Last known price",
"number_habitable_rooms": "Number of bedrooms & living rooms",
"noise_lden_db": "Noise (dB)",
"good_primary_5km": "Good+ primary schools within 5km",
"good_secondary_5km": "Good+ secondary schools within 5km",
"good_primary_2km": "Good+ primary schools within 2km",
"good_secondary_2km": "Good+ secondary schools within 2km",
"outstanding_primary_5km": "Outstanding primary schools within 5km",
"outstanding_secondary_5km": "Outstanding secondary schools within 5km",
"outstanding_primary_2km": "Outstanding primary schools within 2km",
"outstanding_secondary_2km": "Outstanding secondary schools within 2km",
"max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)",
"mean_monthly_rent": "Estimated monthly rent",
"floor_height": "Interior height (m)",
"was_council_house": "Former council house",
"median_age": "Median age",
"turnout_pct": "Voter turnout (%)",
}
)
wide = wide.with_columns(
pl.when(pl.col("duration") == "U")
.then(None)
.otherwise(pl.col("duration"))
.alias("duration"),
pl.when(pl.col("current_energy_rating") == "INVALID!")
.then(None)
.otherwise(pl.col("current_energy_rating"))
.alias("current_energy_rating"),
).with_columns(
(pl.col("latest_price") / pl.col("total_floor_area"))
.round(0)
.cast(pl.Int32)
.alias("Price per sqm"),
)
wide = _finalize_merged_columns(wide)
postcode_area = _finalize_merged_columns(postcode_area)
print("Collecting with streaming engine...")
df = wide.collect(engine="streaming")
if mode == "listings":
df = wide.collect(engine="streaming")
enriched_listings = _finalize_listings(df)
_validate_property_postcodes(enriched_listings)
print(f"Enriched listings rows: {enriched_listings.height}")
return _BuildResult(listings=enriched_listings)
df, postcode_features = pl.collect_all([wide, postcode_area], engine="streaming")
_validate_property_postcodes(df)
# Split into postcode-level and property-level dataframes
area_cols = [
c for c in df.columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
]
postcode_df = df.select(area_cols).group_by("Postcode").first()
postcode_df, properties_df = _split_normal_outputs(
df, postcode_features, expected_postcode_count=active_postcode_count
)
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
property_cols = [
c
for c in df.columns
if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
or c == "Postcode"
]
properties_df = df.select(property_cols)
print(f"Property rows: {properties_df.height}")
return _BuildResult(postcode=postcode_df, properties=properties_df)