improve
This commit is contained in:
parent
8688b7475e
commit
e8345cbdc1
40 changed files with 1980 additions and 904 deletions
|
|
@ -116,6 +116,66 @@ TREE_DENSITY_FEATURE = "Street tree density percentile"
|
|||
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
|
||||
r"^Tree canopy density percentile within \d+m$"
|
||||
)
|
||||
_FINAL_DROP_COLUMNS = [
|
||||
"inspection_date",
|
||||
"_bedrooms",
|
||||
"LSOA name (2021)",
|
||||
"Local Authority District code (2024)",
|
||||
"Local Authority District name (2024)",
|
||||
"Wider Barriers Sub-domain Score",
|
||||
"Geographical Barriers Sub-domain Score",
|
||||
"Adult Skills Sub-domain Score",
|
||||
"Children and Young People Sub-domain Score",
|
||||
"Crime Score",
|
||||
"Living Environment Score",
|
||||
"Index of Multiple Deprivation (IMD) Score",
|
||||
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
|
||||
"Income Deprivation Affecting Children Index (IDACI) Score (rate)",
|
||||
"Barriers to Housing and Services Score",
|
||||
"oa21",
|
||||
"pcon",
|
||||
"epc_property_type",
|
||||
"pp_property_type",
|
||||
"built_form",
|
||||
]
|
||||
_FINAL_RENAME_COLUMNS = {
|
||||
"date_of_transfer": "Date of last transaction",
|
||||
"construction_age_band": "Construction year",
|
||||
"is_construction_date_approximate": "Is construction date approximate",
|
||||
"Income Score (rate)": "Income Score",
|
||||
"Employment Score (rate)": "Employment Score",
|
||||
"Indoors Sub-domain Score": "Housing Conditions Score",
|
||||
"Outdoors Sub-domain Score": "Air Quality and Road Safety Score",
|
||||
"pp_address": "Address per Property Register",
|
||||
"epc_address": "Address per EPC",
|
||||
"postcode": "Postcode",
|
||||
"duration": "Leasehold/Freehold",
|
||||
"current_energy_rating": "Current energy rating",
|
||||
"potential_energy_rating": "Potential energy rating",
|
||||
"total_floor_area": "Total floor area (sqm)",
|
||||
"property_type": "Property type",
|
||||
"restaurants_2km": "Number of restaurants within 2km",
|
||||
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
|
||||
"latest_price": "Last known price",
|
||||
"number_habitable_rooms": "Number of bedrooms & living rooms",
|
||||
"noise_lden_db": "Noise (dB)",
|
||||
"good_primary_5km": "Good+ primary schools within 5km",
|
||||
"good_secondary_5km": "Good+ secondary schools within 5km",
|
||||
"good_primary_2km": "Good+ primary schools within 2km",
|
||||
"good_secondary_2km": "Good+ secondary schools within 2km",
|
||||
"outstanding_primary_5km": "Outstanding primary schools within 5km",
|
||||
"outstanding_secondary_5km": "Outstanding secondary schools within 5km",
|
||||
"outstanding_primary_2km": "Outstanding primary schools within 2km",
|
||||
"outstanding_secondary_2km": "Outstanding secondary schools within 2km",
|
||||
"max_download_speed": "Max available download speed (Mbps)",
|
||||
"serious_crime_avg_yr": "Serious crime (avg/yr)",
|
||||
"minor_crime_avg_yr": "Minor crime (avg/yr)",
|
||||
"mean_monthly_rent": "Estimated monthly rent",
|
||||
"floor_height": "Interior height (m)",
|
||||
"was_council_house": "Former council house",
|
||||
"median_age": "Median age",
|
||||
"turnout_pct": "Voter turnout (%)",
|
||||
}
|
||||
_RENT_SOURCE_UNAVAILABLE_LADS = {
|
||||
# ONS PIPR does not publish LAD-level private-rent estimates for these
|
||||
# small authorities. Keep rent null there, but fail on any other LAD miss.
|
||||
|
|
@ -707,6 +767,181 @@ def _validate_property_postcodes(df: pl.DataFrame) -> None:
|
|||
)
|
||||
|
||||
|
||||
def _active_english_postcode_area(arcgis_raw: pl.LazyFrame) -> pl.LazyFrame:
|
||||
"""Return the supported postcode universe with geography join keys."""
|
||||
return (
|
||||
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")
|
||||
.filter(pl.col("doterm").is_null())
|
||||
.select(
|
||||
pl.col("pcds").alias("postcode"),
|
||||
"lat",
|
||||
pl.col("long").alias("lon"),
|
||||
"ctry25cd",
|
||||
pl.col("lsoa21cd").alias("lsoa21"),
|
||||
pl.col("oa21cd").alias("oa21"),
|
||||
pl.col("pcon24cd").alias("pcon"),
|
||||
)
|
||||
.drop_nulls(["postcode"])
|
||||
.unique(["postcode"])
|
||||
)
|
||||
|
||||
|
||||
def _remap_terminated_postcodes(
|
||||
wide: pl.LazyFrame, postcode_mapping: pl.LazyFrame
|
||||
) -> pl.LazyFrame:
|
||||
return (
|
||||
wide.join(
|
||||
postcode_mapping,
|
||||
left_on="postcode",
|
||||
right_on="old_postcode",
|
||||
how="left",
|
||||
)
|
||||
.with_columns(
|
||||
pl.coalesce("new_postcode", "postcode").alias("postcode"),
|
||||
)
|
||||
.drop("new_postcode")
|
||||
)
|
||||
|
||||
|
||||
def _filter_to_active_english_postcodes(
|
||||
wide: pl.LazyFrame, active_postcodes: pl.LazyFrame
|
||||
) -> pl.LazyFrame:
|
||||
return wide.join(active_postcodes, on="postcode", how="semi")
|
||||
|
||||
|
||||
def _join_area_side_tables(
|
||||
base: pl.LazyFrame,
|
||||
*,
|
||||
iod: pl.LazyFrame,
|
||||
ethnicity: pl.LazyFrame,
|
||||
crime: pl.LazyFrame,
|
||||
median_age: pl.LazyFrame,
|
||||
election: pl.LazyFrame,
|
||||
poi_counts: pl.LazyFrame,
|
||||
noise: pl.LazyFrame,
|
||||
school_proximity: pl.LazyFrame,
|
||||
conservation_areas: pl.LazyFrame,
|
||||
tree_density: pl.LazyFrame | None,
|
||||
broadband: pl.LazyFrame,
|
||||
) -> pl.LazyFrame:
|
||||
base = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
|
||||
base = base.join(
|
||||
ethnicity,
|
||||
left_on="Local Authority District code (2024)",
|
||||
right_on="Geography_code",
|
||||
how="left",
|
||||
)
|
||||
|
||||
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||
# postcode boundary), so it joins on postcode rather than LSOA.
|
||||
base = base.join(crime, on="postcode", how="left")
|
||||
base = base.with_columns(
|
||||
pl.sum_horizontal(
|
||||
"Violence and sexual offences (avg/yr)",
|
||||
"Robbery (avg/yr)",
|
||||
"Burglary (avg/yr)",
|
||||
"Possession of weapons (avg/yr)",
|
||||
).alias("serious_crime_avg_yr"),
|
||||
pl.sum_horizontal(
|
||||
"Anti-social behaviour (avg/yr)",
|
||||
"Criminal damage and arson (avg/yr)",
|
||||
"Shoplifting (avg/yr)",
|
||||
"Bicycle theft (avg/yr)",
|
||||
"Theft from the person (avg/yr)",
|
||||
"Other theft (avg/yr)",
|
||||
"Vehicle crime (avg/yr)",
|
||||
"Public order (avg/yr)",
|
||||
"Drugs (avg/yr)",
|
||||
"Other crime (avg/yr)",
|
||||
).alias("minor_crime_avg_yr"),
|
||||
)
|
||||
|
||||
base = base.join(median_age, on="lsoa21", how="left")
|
||||
base = base.join(election, on="pcon", how="left")
|
||||
base = base.join(poi_counts, on="postcode", how="left")
|
||||
base = base.join(noise, on="postcode", how="left")
|
||||
base = base.join(school_proximity, on="postcode", how="left")
|
||||
base = base.join(conservation_areas, on="postcode", how="left").with_columns(
|
||||
pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
|
||||
)
|
||||
if tree_density is not None:
|
||||
base = base.join(tree_density, on="postcode", how="left")
|
||||
return base.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
|
||||
|
||||
|
||||
def _finalize_merged_columns(frame: pl.LazyFrame) -> pl.LazyFrame:
|
||||
return frame.drop(_FINAL_DROP_COLUMNS, strict=False).rename(
|
||||
_FINAL_RENAME_COLUMNS, strict=False
|
||||
)
|
||||
|
||||
|
||||
def _area_columns_from(columns: list[str]) -> list[str]:
|
||||
return [
|
||||
c for c in columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
|
||||
]
|
||||
|
||||
|
||||
def _property_columns_from(columns: list[str]) -> list[str]:
|
||||
return [
|
||||
c
|
||||
for c in columns
|
||||
if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
|
||||
or c == "Postcode"
|
||||
]
|
||||
|
||||
|
||||
def _validate_postcode_feature_output(
|
||||
postcode_df: pl.DataFrame, expected_postcode_count: int
|
||||
) -> None:
|
||||
required = {"Postcode", "lat", "lon", "ctry25cd"}
|
||||
missing = sorted(required - set(postcode_df.columns))
|
||||
if missing:
|
||||
raise ValueError(f"Postcode feature output missing columns: {missing}")
|
||||
|
||||
unique_count = postcode_df["Postcode"].n_unique()
|
||||
if (
|
||||
postcode_df.height != expected_postcode_count
|
||||
or unique_count != expected_postcode_count
|
||||
):
|
||||
raise ValueError(
|
||||
"Postcode feature output no longer matches the active England "
|
||||
"postcode universe: "
|
||||
f"rows={postcode_df.height:,}, unique={unique_count:,}, "
|
||||
f"expected={expected_postcode_count:,}"
|
||||
)
|
||||
|
||||
invalid = postcode_df.filter(
|
||||
pl.col("Postcode").is_null()
|
||||
| (pl.col("Postcode").cast(pl.Utf8).str.strip_chars() == "")
|
||||
| pl.col("lat").is_null()
|
||||
| pl.col("lon").is_null()
|
||||
| pl.col("ctry25cd").is_null()
|
||||
| (pl.col("ctry25cd") != "E92000001")
|
||||
)
|
||||
if invalid.height > 0:
|
||||
sample = (
|
||||
invalid.select("Postcode", "ctry25cd", "lat", "lon").head(10).to_dicts()
|
||||
)
|
||||
raise ValueError(
|
||||
"Postcode feature output contains unsupported or ungeocoded rows: "
|
||||
f"{invalid.height} rows. Sample: {sample}"
|
||||
)
|
||||
|
||||
|
||||
def _split_normal_outputs(
|
||||
df: pl.DataFrame,
|
||||
postcode_features: pl.DataFrame,
|
||||
*,
|
||||
expected_postcode_count: int,
|
||||
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
||||
postcode_df = postcode_features.select(
|
||||
_area_columns_from(postcode_features.columns)
|
||||
)
|
||||
_validate_postcode_feature_output(postcode_df, expected_postcode_count)
|
||||
properties_df = df.select(_property_columns_from(df.columns))
|
||||
return postcode_df, properties_df
|
||||
|
||||
|
||||
# Map listings-parquet source columns to the `_actual_*` overlay columns
|
||||
# carried alongside the wide frame through the postcode-keyed joins. After the
|
||||
# rest of the pipeline finalises, listing rows pick their canonical dashboard
|
||||
|
|
@ -927,9 +1162,7 @@ def _best_listing_match(
|
|||
return best, float(best_score), "address", best_field
|
||||
|
||||
|
||||
def _load_listings_for_merge(
|
||||
listings_path: Path, arcgis_path: Path
|
||||
) -> pl.DataFrame:
|
||||
def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
|
||||
"""Read the listings parquet and prepare it for the wide-frame merge.
|
||||
|
||||
Output is keyed by `_listing_idx` and carries:
|
||||
|
|
@ -1032,7 +1265,11 @@ def _load_direct_epc_candidates(
|
|||
"_direct_epc_outcode": pl.Utf8,
|
||||
"_direct_epc_canonical_property_type": pl.Utf8,
|
||||
"_direct_epc_uprn": pl.Utf8,
|
||||
**{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
|
||||
**{
|
||||
column: dtype
|
||||
for column, dtype in _DIRECT_EPC_COLUMNS
|
||||
if column.startswith("_direct_")
|
||||
},
|
||||
}
|
||||
if not listing_outcodes:
|
||||
return pl.DataFrame(schema=schema)
|
||||
|
|
@ -1089,9 +1326,7 @@ def _load_direct_epc_candidates(
|
|||
pl.col("epc_address").alias("_direct_epc_address"),
|
||||
pl.col("uprn").alias("_direct_epc_uprn"),
|
||||
pl.col("total_floor_area").alias("_direct_total_floor_area"),
|
||||
pl.col("number_habitable_rooms").alias(
|
||||
"_direct_number_habitable_rooms"
|
||||
),
|
||||
pl.col("number_habitable_rooms").alias("_direct_number_habitable_rooms"),
|
||||
pl.col("floor_height").alias("_direct_floor_height"),
|
||||
pl.col("_direct_was_council_house").fill_null("No"),
|
||||
)
|
||||
|
|
@ -1141,9 +1376,7 @@ def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
|
|||
)
|
||||
|
||||
|
||||
def _optional_lazy_col(
|
||||
schema: pl.Schema, column: str, dtype: pl.DataType
|
||||
) -> pl.Expr:
|
||||
def _optional_lazy_col(schema: pl.Schema, column: str, dtype: pl.DataType) -> pl.Expr:
|
||||
if column in schema:
|
||||
return pl.col(column).cast(dtype, strict=False).alias(column)
|
||||
return pl.lit(None, dtype=dtype).alias(column)
|
||||
|
|
@ -1640,27 +1873,18 @@ def _build(
|
|||
| (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
|
||||
)
|
||||
|
||||
# Remap terminated postcodes to nearest active successor
|
||||
# Remap terminated postcodes to nearest active successor before filtering to
|
||||
# the supported active-English postcode universe. Historical properties from
|
||||
# terminated English postcodes are retained under their successor postcode.
|
||||
postcode_mapping = build_postcode_mapping(arcgis_path)
|
||||
wide = (
|
||||
wide.join(
|
||||
postcode_mapping.lazy(),
|
||||
left_on="postcode",
|
||||
right_on="old_postcode",
|
||||
how="left",
|
||||
)
|
||||
.with_columns(
|
||||
pl.coalesce("new_postcode", "postcode").alias("postcode"),
|
||||
)
|
||||
.drop("new_postcode")
|
||||
)
|
||||
|
||||
wide = _remap_terminated_postcodes(wide, postcode_mapping.lazy())
|
||||
arcgis_raw = pl.scan_parquet(arcgis_path)
|
||||
postcode_country = arcgis_raw.select(
|
||||
pl.col("pcds").alias("postcode"),
|
||||
pl.col("ctry25cd"),
|
||||
).unique(["postcode"])
|
||||
wide = wide.join(postcode_country, on="postcode", how="left")
|
||||
arcgis = _active_english_postcode_area(arcgis_raw)
|
||||
active_postcodes = arcgis.select("postcode").unique()
|
||||
active_postcode_count = (
|
||||
active_postcodes.select(pl.len()).collect(engine="streaming").item()
|
||||
)
|
||||
wide = _filter_to_active_english_postcodes(wide, active_postcodes)
|
||||
|
||||
if listed_buildings_path is not None:
|
||||
active_postcodes_for_listed = (
|
||||
|
|
@ -1691,92 +1915,25 @@ def _build(
|
|||
arcgis_path,
|
||||
epc_path=actual_listings_epc_path,
|
||||
)
|
||||
wide = _filter_to_active_english_postcodes(wide, active_postcodes)
|
||||
|
||||
wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))
|
||||
|
||||
arcgis = (
|
||||
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001") # England only
|
||||
.filter(pl.col("doterm").is_null()) # Active postcodes only
|
||||
# NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
|
||||
# Alias them back to the short canonical names used across the
|
||||
# pipeline so downstream joins don't need to know about NSPL's
|
||||
# versioning scheme.
|
||||
.select(
|
||||
pl.col("pcds").alias("postcode"),
|
||||
"lat",
|
||||
pl.col("long").alias("lon"),
|
||||
pl.col("lsoa21cd").alias("lsoa21"),
|
||||
pl.col("oa21cd").alias("oa21"),
|
||||
pl.col("pcon24cd").alias("pcon"),
|
||||
)
|
||||
)
|
||||
# NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
|
||||
# `_active_english_postcode_area` aliases them back to the short canonical
|
||||
# names used across the pipeline so downstream joins don't need to know
|
||||
# about NSPL's versioning scheme.
|
||||
wide = wide.join(arcgis, on="postcode", how="left")
|
||||
postcode_area = arcgis
|
||||
|
||||
iod = pl.scan_parquet(iod_path).with_columns(
|
||||
*(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
|
||||
)
|
||||
wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")
|
||||
|
||||
ethnicity = pl.scan_parquet(ethnicity_path)
|
||||
wide = wide.join(
|
||||
ethnicity,
|
||||
left_on="Local Authority District code (2024)",
|
||||
right_on="Geography_code",
|
||||
how="left",
|
||||
)
|
||||
|
||||
# Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
|
||||
wide = wide.with_columns(
|
||||
(pl.col("number_habitable_rooms") - 1)
|
||||
.clip(0, 4)
|
||||
.cast(pl.UInt8)
|
||||
.alias("_bedrooms"),
|
||||
)
|
||||
rental = pl.scan_parquet(rental_prices_path).select(
|
||||
"area_code", "bedrooms", "mean_monthly_rent"
|
||||
)
|
||||
wide = wide.join(
|
||||
rental,
|
||||
left_on=["Local Authority District code (2024)", "_bedrooms"],
|
||||
right_on=["area_code", "bedrooms"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||
# postcode boundary), so it joins on postcode rather than LSOA.
|
||||
crime = pl.scan_parquet(crime_path)
|
||||
wide = wide.join(crime, on="postcode", how="left")
|
||||
|
||||
wide = wide.with_columns(
|
||||
pl.sum_horizontal(
|
||||
"Violence and sexual offences (avg/yr)",
|
||||
"Robbery (avg/yr)",
|
||||
"Burglary (avg/yr)",
|
||||
"Possession of weapons (avg/yr)",
|
||||
).alias("serious_crime_avg_yr"),
|
||||
pl.sum_horizontal(
|
||||
"Anti-social behaviour (avg/yr)",
|
||||
"Criminal damage and arson (avg/yr)",
|
||||
"Shoplifting (avg/yr)",
|
||||
"Bicycle theft (avg/yr)",
|
||||
"Theft from the person (avg/yr)",
|
||||
"Other theft (avg/yr)",
|
||||
"Vehicle crime (avg/yr)",
|
||||
"Public order (avg/yr)",
|
||||
"Drugs (avg/yr)",
|
||||
"Other crime (avg/yr)",
|
||||
).alias("minor_crime_avg_yr"),
|
||||
)
|
||||
|
||||
median_age = pl.scan_parquet(median_age_path)
|
||||
wide = wide.join(median_age, on="lsoa21", how="left")
|
||||
|
||||
election = pl.scan_parquet(election_results_path)
|
||||
wide = wide.join(election, on="pcon", how="left")
|
||||
|
||||
poi_counts = pl.scan_parquet(poi_proximity_path)
|
||||
wide = wide.join(poi_counts, on="postcode", how="left")
|
||||
|
||||
noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
|
||||
noise = (
|
||||
pl.scan_parquet(noise_path)
|
||||
|
|
@ -1789,21 +1946,13 @@ def _build(
|
|||
)
|
||||
.select("postcode", "noise_lden_db")
|
||||
)
|
||||
wide = wide.join(noise, on="postcode", how="left")
|
||||
|
||||
school_proximity = pl.scan_parquet(school_proximity_path)
|
||||
wide = wide.join(school_proximity, on="postcode", how="left")
|
||||
|
||||
conservation_areas = _conservation_area_by_postcode(
|
||||
arcgis.select("postcode", "lat", "lon"), conservation_areas_path
|
||||
)
|
||||
wide = wide.join(conservation_areas, on="postcode", how="left").with_columns(
|
||||
pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
|
||||
)
|
||||
|
||||
tree_density = None
|
||||
if tree_density_postcodes_path is not None:
|
||||
tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
|
||||
wide = wide.join(tree_density, on="postcode", how="left")
|
||||
|
||||
# Broadband: derive max available download speed tier per postcode from
|
||||
# Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
|
||||
|
|
@ -1828,7 +1977,38 @@ def _build(
|
|||
.agg(pl.col("max_download_speed").max())
|
||||
.with_columns(pl.col("max_download_speed").cast(pl.Utf8))
|
||||
)
|
||||
wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")
|
||||
area_side_tables = {
|
||||
"iod": iod,
|
||||
"ethnicity": ethnicity,
|
||||
"crime": crime,
|
||||
"median_age": median_age,
|
||||
"election": election,
|
||||
"poi_counts": poi_counts,
|
||||
"noise": noise,
|
||||
"school_proximity": school_proximity,
|
||||
"conservation_areas": conservation_areas,
|
||||
"tree_density": tree_density,
|
||||
"broadband": broadband,
|
||||
}
|
||||
wide = _join_area_side_tables(wide, **area_side_tables)
|
||||
postcode_area = _join_area_side_tables(postcode_area, **area_side_tables)
|
||||
|
||||
# Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
|
||||
wide = wide.with_columns(
|
||||
(pl.col("number_habitable_rooms") - 1)
|
||||
.clip(0, 4)
|
||||
.cast(pl.UInt8)
|
||||
.alias("_bedrooms"),
|
||||
)
|
||||
rental = pl.scan_parquet(rental_prices_path).select(
|
||||
"area_code", "bedrooms", "mean_monthly_rent"
|
||||
)
|
||||
wide = wide.join(
|
||||
rental,
|
||||
left_on=["Local Authority District code (2024)", "_bedrooms"],
|
||||
right_on=["area_code", "bedrooms"],
|
||||
how="left",
|
||||
)
|
||||
|
||||
# Derive property_type: prefer EPC data, fall back to price-paid.
|
||||
# For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
|
||||
|
|
@ -1862,112 +2042,40 @@ def _build(
|
|||
.alias("property_type")
|
||||
)
|
||||
|
||||
wide = (
|
||||
wide.with_columns(
|
||||
pl.when(pl.col("duration") == "U")
|
||||
.then(None)
|
||||
.otherwise(pl.col("duration"))
|
||||
.alias("duration"),
|
||||
pl.when(pl.col("current_energy_rating") == "INVALID!")
|
||||
.then(None)
|
||||
.otherwise(pl.col("current_energy_rating"))
|
||||
.alias("current_energy_rating"),
|
||||
)
|
||||
.with_columns(
|
||||
(pl.col("latest_price") / pl.col("total_floor_area"))
|
||||
.round(0)
|
||||
.cast(pl.Int32)
|
||||
.alias("Price per sqm"),
|
||||
)
|
||||
.drop(
|
||||
"inspection_date",
|
||||
"_bedrooms",
|
||||
"LSOA name (2021)",
|
||||
"Local Authority District code (2024)",
|
||||
"Local Authority District name (2024)",
|
||||
"Wider Barriers Sub-domain Score",
|
||||
"Geographical Barriers Sub-domain Score",
|
||||
"Adult Skills Sub-domain Score",
|
||||
"Children and Young People Sub-domain Score",
|
||||
"Crime Score",
|
||||
"Living Environment Score",
|
||||
"Index of Multiple Deprivation (IMD) Score",
|
||||
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
|
||||
"Income Deprivation Affecting Children Index (IDACI) Score (rate)",
|
||||
"Barriers to Housing and Services Score",
|
||||
"oa21",
|
||||
"pcon",
|
||||
"epc_property_type",
|
||||
"pp_property_type",
|
||||
"built_form",
|
||||
)
|
||||
.rename(
|
||||
{
|
||||
"date_of_transfer": "Date of last transaction",
|
||||
"construction_age_band": "Construction year",
|
||||
"is_construction_date_approximate": "Is construction date approximate",
|
||||
"Income Score (rate)": "Income Score",
|
||||
"Employment Score (rate)": "Employment Score",
|
||||
"Indoors Sub-domain Score": "Housing Conditions Score",
|
||||
"Outdoors Sub-domain Score": "Air Quality and Road Safety Score",
|
||||
"pp_address": "Address per Property Register",
|
||||
"epc_address": "Address per EPC",
|
||||
"postcode": "Postcode",
|
||||
"duration": "Leasehold/Freehold",
|
||||
"current_energy_rating": "Current energy rating",
|
||||
"potential_energy_rating": "Potential energy rating",
|
||||
"total_floor_area": "Total floor area (sqm)",
|
||||
"property_type": "Property type",
|
||||
"restaurants_2km": "Number of restaurants within 2km",
|
||||
"groceries_2km": "Number of grocery shops and supermarkets within 2km",
|
||||
"latest_price": "Last known price",
|
||||
"number_habitable_rooms": "Number of bedrooms & living rooms",
|
||||
"noise_lden_db": "Noise (dB)",
|
||||
"good_primary_5km": "Good+ primary schools within 5km",
|
||||
"good_secondary_5km": "Good+ secondary schools within 5km",
|
||||
"good_primary_2km": "Good+ primary schools within 2km",
|
||||
"good_secondary_2km": "Good+ secondary schools within 2km",
|
||||
"outstanding_primary_5km": "Outstanding primary schools within 5km",
|
||||
"outstanding_secondary_5km": "Outstanding secondary schools within 5km",
|
||||
"outstanding_primary_2km": "Outstanding primary schools within 2km",
|
||||
"outstanding_secondary_2km": "Outstanding secondary schools within 2km",
|
||||
"max_download_speed": "Max available download speed (Mbps)",
|
||||
"serious_crime_avg_yr": "Serious crime (avg/yr)",
|
||||
"minor_crime_avg_yr": "Minor crime (avg/yr)",
|
||||
"mean_monthly_rent": "Estimated monthly rent",
|
||||
"floor_height": "Interior height (m)",
|
||||
"was_council_house": "Former council house",
|
||||
"median_age": "Median age",
|
||||
"turnout_pct": "Voter turnout (%)",
|
||||
}
|
||||
)
|
||||
wide = wide.with_columns(
|
||||
pl.when(pl.col("duration") == "U")
|
||||
.then(None)
|
||||
.otherwise(pl.col("duration"))
|
||||
.alias("duration"),
|
||||
pl.when(pl.col("current_energy_rating") == "INVALID!")
|
||||
.then(None)
|
||||
.otherwise(pl.col("current_energy_rating"))
|
||||
.alias("current_energy_rating"),
|
||||
).with_columns(
|
||||
(pl.col("latest_price") / pl.col("total_floor_area"))
|
||||
.round(0)
|
||||
.cast(pl.Int32)
|
||||
.alias("Price per sqm"),
|
||||
)
|
||||
wide = _finalize_merged_columns(wide)
|
||||
postcode_area = _finalize_merged_columns(postcode_area)
|
||||
|
||||
print("Collecting with streaming engine...")
|
||||
df = wide.collect(engine="streaming")
|
||||
|
||||
if mode == "listings":
|
||||
df = wide.collect(engine="streaming")
|
||||
enriched_listings = _finalize_listings(df)
|
||||
_validate_property_postcodes(enriched_listings)
|
||||
print(f"Enriched listings rows: {enriched_listings.height}")
|
||||
return _BuildResult(listings=enriched_listings)
|
||||
|
||||
df, postcode_features = pl.collect_all([wide, postcode_area], engine="streaming")
|
||||
_validate_property_postcodes(df)
|
||||
|
||||
# Split into postcode-level and property-level dataframes
|
||||
area_cols = [
|
||||
c for c in df.columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
|
||||
]
|
||||
postcode_df = df.select(area_cols).group_by("Postcode").first()
|
||||
postcode_df, properties_df = _split_normal_outputs(
|
||||
df, postcode_features, expected_postcode_count=active_postcode_count
|
||||
)
|
||||
print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
|
||||
|
||||
property_cols = [
|
||||
c
|
||||
for c in df.columns
|
||||
if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
|
||||
or c == "Postcode"
|
||||
]
|
||||
properties_df = df.select(property_cols)
|
||||
print(f"Property rows: {properties_df.height}")
|
||||
|
||||
return _BuildResult(postcode=postcode_df, properties=properties_df)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue