perfect-postcode/pipeline/transform/merge.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
SPlit up
2026-06-12 21:51:37 +01:00

2675 lines
99 KiB
Python

import argparse
import re
import tempfile
from dataclasses import dataclass
from datetime import date
from typing import Literal
import numpy as np
import polars as pl
from pathlib import Path
import pyogrio
from pyproj import Transformer
from scipy.spatial import cKDTree
from shapely import from_wkb, points
from shapely.geometry.base import BaseGeometry
from shapely.strtree import STRtree
from thefuzz import fuzz
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.join_epc_pp import _scan_epc_certificates, epc_band_to_year
from pipeline.transform.price_estimation.knn import (
MAX_COMPARABLE_PSM,
MIN_COMPARABLE_PSM,
)
from pipeline.utils.fuzzy_join import (
_NUMBER_RE as _SUFFIXED_NUMBER_RE,
_numbers_compatible as _equal_numbers_compatible,
normalize_address_key,
normalize_postcode_key,
)
from pipeline.utils.normalize import drop_digit_tokens
from pipeline.utils.postcode_mapping import build_postcode_mapping
# Minimum floor area in square metres (per the name). It is not referenced in
# the visible part of this file — presumably a plausibility floor applied
# elsewhere; TODO confirm against its users.
MIN_FLOOR_AREA_M2 = 10
# Output column holding the per-postcode "Yes"/"No" conservation-area flag.
CONSERVATION_AREA_FEATURE = "Within conservation area"
# Named "Tree canopy" (not "Street tree") because the underlying density unions
# Forest Research TOW lone-tree/group crowns AND NFI woodland canopy, so a
# woodland-edge postcode's score reflects forest canopy, not only street trees.
TREE_DENSITY_FEATURE = "Tree canopy density percentile"
# Output column holding the per-property listed-building flag ("Yes" when an
# address matches a listed-building name).
LISTED_BUILDING_FEATURE = "Listed building"
# Defaults used when assigning listed-building points to postcodes: the search
# radius (metres) and how many nearest postcodes are considered per point.
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
LISTED_BUILDING_NEAREST_POSTCODES = 3
# Default minimum `fuzz.token_set_ratio` score for an address to count as a
# match against a listed-building name.
LISTED_BUILDING_MIN_MATCH_SCORE = 95
# `dataset` value that identifies conservation-area records; records carrying
# any other (non-null) dataset value are skipped when polygons are loaded.
PLANNING_DATA_CONSERVATION_AREA_DATASET = "conservation-area"
# Indices of Deprivation score columns (raw source names, before the final
# rename). NOTE(review): the consumer is not visible in this part of the file —
# presumably these are the columns converted with
# `_less_deprived_percentile_expr`; confirm against the caller.
_IOD_PERCENTILE_COLUMNS = [
    "Education, Skills and Training Score",
    "Income Score (rate)",
    "Employment Score (rate)",
    "Health Deprivation and Disability Score",
    "Indoors Sub-domain Score",
    "Outdoors Sub-domain Score",
]
# Final (post-rename) column names that are postcode-level rather than
# property-level. `_area_columns_from` keeps these for the postcode output and
# `_property_columns_from` excludes them from the property output (except
# "Postcode", which appears in both).
_AREA_COLUMNS = [
    "Postcode",
    "lat",
    "lon",
    # Runtime provenance for deciding whether missing coordinates are skippable.
    "ctry25cd",
    # Join key for LSOA-level side tables (e.g. median age).
    "lsoa21",
    # Deprivation
    "Income Score",
    "Employment Score",
    "Education, Skills and Training Score",
    "Health Deprivation and Disability Score",
    "Housing Conditions Score",
    "Air Quality and Road Safety Score",
    # Ethnicity
    "% South Asian",
    "% East Asian",
    "% SE Asian",
    "% Black",
    "% Mixed",
    "% White",
    "% Other",
    # Crime
    "Anti-social behaviour (avg/yr)",
    "Violence and sexual offences (avg/yr)",
    "Criminal damage and arson (avg/yr)",
    "Burglary (avg/yr)",
    "Vehicle crime (avg/yr)",
    "Robbery (avg/yr)",
    "Other theft (avg/yr)",
    "Shoplifting (avg/yr)",
    "Drugs (avg/yr)",
    "Possession of weapons (avg/yr)",
    "Public order (avg/yr)",
    "Bicycle theft (avg/yr)",
    "Theft from the person (avg/yr)",
    "Other crime (avg/yr)",
    "Serious crime (avg/yr)",
    "Minor crime (avg/yr)",
    # Amenities
    "Number of restaurants within 2km",
    "Number of grocery shops and supermarkets within 2km",
    # Environment
    "Noise (dB)",
    "Max available download speed (Mbps)",
    CONSERVATION_AREA_FEATURE,
    # Tree canopy is a 50m-radius percentile around the postcode centroid, so it
    # is postcode-grain: it belongs in the area output (one value per postcode,
    # covering property-less postcodes too) rather than duplicated per property.
    TREE_DENSITY_FEATURE,
    # Schools (modelled historical catchment areas covering the postcode)
    "Good+ primary school catchments",
    "Good+ secondary school catchments",
    "Outstanding primary school catchments",
    "Outstanding secondary school catchments",
    # Demographics
    "Median age",
    # Politics
    "Voter turnout (%)",
    "% Labour",
    "% Conservative",
    "% Liberal Democrat",
    "% Reform UK",
    "% Green",
    "% Other parties",
]
# Full-name patterns for amenity metric columns whose amenity label varies at
# runtime; `_is_dynamic_poi_metric_column` treats a match on either as a
# postcode-level metric column.
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
# Fallback pattern for a radius-suffixed tree-density column; used by
# `_tree_density_by_postcode` when the canonical column name is absent.
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
    r"^Tree canopy density percentile within \d+m$"
)
# Columns removed by `_finalize_merged_columns` before the rename step. The
# drop is non-strict, so names that are absent from a frame are ignored.
_FINAL_DROP_COLUMNS = [
    "inspection_date",
    "_bedrooms",
    "LSOA name (2021)",
    "Local Authority District code (2024)",
    "Local Authority District name (2024)",
    "Wider Barriers Sub-domain Score",
    "Geographical Barriers Sub-domain Score",
    "Adult Skills Sub-domain Score",
    "Children and Young People Sub-domain Score",
    "Crime Score",
    "Living Environment Score",
    "Index of Multiple Deprivation (IMD) Score",
    "Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
    "Income Deprivation Affecting Children Index (IDACI) Score (rate)",
    "Barriers to Housing and Services Score",
    "oa21",
    "pcon",
    "epc_property_type",
    "pp_property_type",
    "built_form",
]
# Internal column name -> final output column name, applied by
# `_finalize_merged_columns` after the drop step. The rename is non-strict, so
# keys that are absent from a frame are ignored.
_FINAL_RENAME_COLUMNS = {
    "date_of_transfer": "Date of last transaction",
    "construction_age_band": "Construction year",
    "is_construction_date_approximate": "Is construction date approximate",
    "Income Score (rate)": "Income Score",
    "Employment Score (rate)": "Employment Score",
    "Indoors Sub-domain Score": "Housing Conditions Score",
    "Outdoors Sub-domain Score": "Air Quality and Road Safety Score",
    "pp_address": "Address per Property Register",
    "epc_address": "Address per EPC",
    "postcode": "Postcode",
    "duration": "Leasehold/Freehold",
    "current_energy_rating": "Current energy rating",
    "potential_energy_rating": "Potential energy rating",
    "total_floor_area": "Total floor area (sqm)",
    "property_type": "Property type",
    "restaurants_2km": "Number of restaurants within 2km",
    "groceries_2km": "Number of grocery shops and supermarkets within 2km",
    "latest_price": "Last known price",
    "number_habitable_rooms": "Number of bedrooms & living rooms",
    "noise_lden_db": "Noise (dB)",
    "good_primary_catchments": "Good+ primary school catchments",
    "good_secondary_catchments": "Good+ secondary school catchments",
    "outstanding_primary_catchments": "Outstanding primary school catchments",
    "outstanding_secondary_catchments": "Outstanding secondary school catchments",
    "max_download_speed": "Max available download speed (Mbps)",
    "serious_crime_avg_yr": "Serious crime (avg/yr)",
    "minor_crime_avg_yr": "Minor crime (avg/yr)",
    "mean_monthly_rent": "Estimated monthly rent",
    "floor_height": "Interior height (m)",
    "was_council_house": "Former council house",
    "median_age": "Median age",
    "turnout_pct": "Voter turnout (%)",
}
# LAD code -> name for authorities that are allowed to be absent from the
# rental table; `_validate_lad_source_coverage` only uses the keys.
_RENT_SOURCE_UNAVAILABLE_LADS = {
    # ONS PIPR does not publish LAD-level private-rent estimates for these
    # small authorities. Keep rent null there, but fail on any other LAD miss.
    "E06000053": "Isles of Scilly",
    "E09000001": "City of London",
}
# Matches each run of digits; used for listed-building name/number checks.
# Unlike the imported `_SUFFIXED_NUMBER_RE`, it ignores any letter suffix.
_NUMBER_RE = re.compile(r"\d+")
# Upper-case filler words that do not count as substantive tokens when
# `_is_matchable_listed_name` decides whether a number-less name is specific
# enough to match on.
_LISTED_NAME_STOP_WORDS = {
    "A",
    "AN",
    "AND",
    "AT",
    "BY",
    "IN",
    "OF",
    "ON",
    "THE",
    "TO",
    "WITH",
}
def _is_dynamic_poi_metric_column(column: str) -> bool:
    """Report whether ``column`` is a runtime-named amenity metric column.

    True when the name matches either the "distance to nearest amenity" or the
    "number of amenities within N km" pattern.
    """
    return any(
        pattern.match(column) is not None
        for pattern in (_DYNAMIC_POI_DISTANCE_RE, _DYNAMIC_POI_COUNT_RE)
    )
def _subset_numbers_compatible(left: str, right: str) -> bool:
    """Check that the numbers on one side are a subset of the other side's.

    Subset semantics (rather than equality) are appropriate ONLY for
    listed-building name matching: a list entry such as "10-12 HIGH STREET"
    should flag "10 HIGH STREET". Address-to-address matching must use the
    canonical `fuzzy_join._numbers_compatible` instead (set equality over
    ``\\d+[A-Z]?`` tokens) — with subset semantics a single flat would absorb
    its whole building (see the fuzzy_join docstring).
    """
    fewer = set(_NUMBER_RE.findall(left))
    more = set(_NUMBER_RE.findall(right))
    if len(fewer) > len(more):
        fewer, more = more, fewer
    # One side carries numbers and the other carries none: not compatible.
    if more and not fewer:
        return False
    return fewer <= more
def _listed_candidate_schema() -> dict[str, pl.DataType]:
    """Return the column -> dtype schema of the listed-building candidate frame."""
    schema: dict[str, pl.DataType] = {
        name: pl.Utf8
        for name in ("postcode", "_listed_match_name", "_listed_grade")
    }
    # The list-entry identifier is the only non-text column.
    schema["_listed_entry"] = pl.Int64
    return schema
def _empty_listed_candidates() -> pl.DataFrame:
    """Return a zero-row candidate frame that still carries the candidate schema."""
    candidate_schema = _listed_candidate_schema()
    return pl.DataFrame(schema=candidate_schema)
def _empty_listed_property_flags() -> pl.DataFrame:
    """Return a zero-row flag frame keyed by postcode and Price-Paid address."""
    text_columns = ("postcode", "pp_address", LISTED_BUILDING_FEATURE)
    return pl.DataFrame(schema={name: pl.Utf8 for name in text_columns})
def _is_matchable_listed_name(name_key: str | None) -> bool:
    """Decide whether a normalised listed-building name is specific enough to match.

    A name qualifies when it contains a number, or when it has at least two
    tokens of three or more characters that are not stop words. Empty or
    missing names never qualify.
    """
    if not name_key:
        return False
    if _NUMBER_RE.search(name_key):
        return True
    substantive_count = sum(
        1
        for word in name_key.split()
        if len(word) >= 3 and word not in _LISTED_NAME_STOP_WORDS
    )
    return substantive_count >= 2
def _load_listed_building_points(listed_buildings_path: Path) -> pl.DataFrame:
    """Load Historic England NHLE listed-building point attributes.

    Reads the attribute columns only (no geometry), casts them to fixed
    dtypes, drops rows without a name or coordinates, and adds the normalised
    `_listed_match_name` key, discarding rows whose key is null.

    Raises:
        ValueError: if the source is not point data or lacks a required column.
    """
    wanted_dtypes = {
        "ListEntry": pl.Int64,
        "Name": pl.Utf8,
        "Grade": pl.Utf8,
        "Easting": pl.Float64,
        "Northing": pl.Float64,
    }
    columns = list(wanted_dtypes)

    source_info = pyogrio.read_info(listed_buildings_path)
    geometry_type = str(source_info.get("geometry_type") or "")
    if "Point" not in geometry_type:
        raise ValueError(
            f"Expected listed-building point data, got geometry {geometry_type!r}"
        )

    _, arrow_table = pyogrio.read_arrow(
        listed_buildings_path,
        columns=columns,
        read_geometry=False,
    )
    frame = pl.from_arrow(arrow_table)

    absent = sorted(set(columns) - set(frame.columns))
    if absent:
        raise ValueError(
            f"{listed_buildings_path} is missing listed-building columns: {absent}"
        )

    typed = frame.select(
        [pl.col(name).cast(dtype) for name, dtype in wanted_dtypes.items()]
    )
    located = typed.drop_nulls(["Name", "Easting", "Northing"])
    keyed = located.with_columns(
        normalize_address_key(pl.col("Name")).alias("_listed_match_name")
    )
    return keyed.filter(pl.col("_listed_match_name").is_not_null())
def _postcode_listed_building_candidates(
    listed_points: pl.DataFrame,
    active_postcodes: pl.DataFrame,
    *,
    nearest_postcodes: int = LISTED_BUILDING_NEAREST_POSTCODES,
    max_distance_m: float = LISTED_BUILDING_MATCH_RADIUS_M,
) -> pl.DataFrame:
    """Assign each listed-building point to nearby active postcode candidates.

    For every listed point with a matchable name, the up-to-``nearest_postcodes``
    closest postcodes within ``max_distance_m`` become candidate rows of
    (postcode, name key, grade, list entry).

    Raises:
        ValueError: if either input frame lacks a required column.
    """
    if listed_points.is_empty() or active_postcodes.is_empty():
        return _empty_listed_candidates()

    absent = sorted({"postcode", "east1m", "north1m"} - set(active_postcodes.columns))
    if absent:
        raise ValueError(f"Active postcode data missing required columns: {absent}")

    listed_required = {
        "_listed_match_name",
        "Grade",
        "ListEntry",
        "Easting",
        "Northing",
    }
    absent = sorted(listed_required - set(listed_points.columns))
    if absent:
        raise ValueError(f"Listed-building data missing required columns: {absent}")

    # Only rows with finite, non-null coordinates can go into the spatial index.
    usable_postcodes = active_postcodes.drop_nulls(
        ["postcode", "east1m", "north1m"]
    ).filter(pl.col("east1m").is_finite() & pl.col("north1m").is_finite())
    usable_listed = listed_points.drop_nulls(
        ["_listed_match_name", "Easting", "Northing"]
    ).filter(pl.col("Easting").is_finite() & pl.col("Northing").is_finite())
    if usable_postcodes.is_empty() or usable_listed.is_empty():
        return _empty_listed_candidates()

    postcode_total = usable_postcodes.height
    postcode_xy = np.column_stack(
        [usable_postcodes["east1m"].to_numpy(), usable_postcodes["north1m"].to_numpy()]
    )
    listed_xy = np.column_stack(
        [usable_listed["Easting"].to_numpy(), usable_listed["Northing"].to_numpy()]
    )

    neighbour_count = max(1, min(nearest_postcodes, postcode_total))
    distances, indices = cKDTree(postcode_xy).query(
        listed_xy,
        k=neighbour_count,
        distance_upper_bound=max_distance_m,
    )
    if neighbour_count == 1:
        # A k=1 query returns 1-D arrays; lift them to one column per neighbour.
        distances = distances[:, np.newaxis]
        indices = indices[:, np.newaxis]

    postcode_values = usable_postcodes["postcode"].to_list()
    per_listed_point = zip(
        usable_listed["_listed_match_name"].to_list(),
        usable_listed["Grade"].to_list(),
        usable_listed["ListEntry"].to_list(),
        distances,
        indices,
    )

    rows: list[tuple[str, str, str | None, int | None]] = []
    for name_key, grade, entry, point_distances, point_indices in per_listed_point:
        if not _is_matchable_listed_name(name_key):
            continue
        emitted: set[str] = set()
        for distance, postcode_idx in zip(point_distances, point_indices):
            # Unfilled neighbour slots come back with a non-finite distance and
            # an index equal to the number of indexed points.
            if not np.isfinite(distance) or postcode_idx >= postcode_total:
                continue
            postcode = postcode_values[int(postcode_idx)]
            if postcode in emitted:
                continue
            emitted.add(postcode)
            rows.append((postcode, name_key, grade, entry))

    if not rows:
        return _empty_listed_candidates()

    candidate_schema = _listed_candidate_schema()
    candidates = pl.DataFrame(rows, schema=list(candidate_schema), orient="row")
    return candidates.cast(candidate_schema).unique(
        ["postcode", "_listed_match_name", "_listed_entry"]
    )
def _matched_listed_building_flags(
    properties: pl.LazyFrame,
    listed_candidates: pl.DataFrame,
    *,
    min_score: int = LISTED_BUILDING_MIN_MATCH_SCORE,
) -> pl.DataFrame:
    """Return property keys that conservatively match an NHLE listed entry.

    A property matches when, within its postcode, either of its normalised
    addresses is number-compatible with a candidate listed name and scores at
    least ``min_score`` on `fuzz.token_set_ratio`. The result has one row per
    (postcode, pp_address) with the listed-building column set to "Yes".
    """
    if listed_candidates.is_empty():
        return _empty_listed_property_flags()

    postcodes_with_candidates = listed_candidates.select("postcode").unique()
    has_any_match_key = (
        pl.col("_pp_match_address").is_not_null()
        | pl.col("_epc_match_address").is_not_null()
    )
    property_candidates = (
        properties.select("postcode", "pp_address", "epc_address")
        .join(postcodes_with_candidates.lazy(), on="postcode", how="semi")
        .with_columns(
            normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
            normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
        )
        .filter(pl.col("pp_address").is_not_null() & has_any_match_key)
        .collect(engine="streaming")
    )
    if property_candidates.is_empty():
        return _empty_listed_property_flags()

    names_by_postcode: dict[str, list[str]] = {}
    name_pairs = listed_candidates.select("postcode", "_listed_match_name")
    for candidate_postcode, candidate_name in name_pairs.iter_rows():
        if candidate_postcode and candidate_name:
            names_by_postcode.setdefault(candidate_postcode, []).append(
                candidate_name
            )

    def _matches_some_name(address_key: str, names: list[str]) -> bool:
        # The cheap number check runs first; the fuzzy score only for survivors.
        return any(
            _subset_numbers_compatible(address_key, name)
            and fuzz.token_set_ratio(address_key, name) >= min_score
            for name in names
        )

    matches: list[tuple[str, str, str]] = []
    for record in property_candidates.iter_rows(named=True):
        postcode = record["postcode"]
        names = names_by_postcode.get(postcode)
        if not names:
            continue
        # Distinct, non-empty address keys in PP-then-EPC order.
        address_keys = list(
            dict.fromkeys(
                key
                for key in (
                    record.get("_pp_match_address"),
                    record.get("_epc_match_address"),
                )
                if key
            )
        )
        if any(_matches_some_name(key, names) for key in address_keys):
            matches.append((postcode, record["pp_address"], "Yes"))

    if not matches:
        return _empty_listed_property_flags()

    flag_columns = ["postcode", "pp_address", LISTED_BUILDING_FEATURE]
    flags = pl.DataFrame(matches, schema=flag_columns, orient="row")
    return flags.cast({name: pl.Utf8 for name in flag_columns}).unique(
        ["postcode", "pp_address"]
    )
def _listed_building_flags(
    properties: pl.LazyFrame,
    active_postcodes: pl.DataFrame,
    listed_buildings_path: Path,
) -> pl.DataFrame:
    """Load listed-building points and flag the properties that match them.

    Runs the three stages in order — load points, assign them to nearby
    postcodes, fuzzy-match names to addresses — printing progress at each step.
    """
    print(f"Loading listed-building points from {listed_buildings_path}...")
    points_frame = _load_listed_building_points(listed_buildings_path)
    print(f"Loaded {points_frame.height} listed-building point records")

    candidates = _postcode_listed_building_candidates(points_frame, active_postcodes)
    nearby_postcode_count = candidates["postcode"].n_unique()
    print(
        "Matching listed-building names to property addresses across "
        f"{nearby_postcode_count} nearby postcodes..."
    )

    matched_flags = _matched_listed_building_flags(properties, candidates)
    print(
        f"Matched {matched_flags.height} property addresses to listed-building entries"
    )
    return matched_flags
def _normalise_crs(crs: object | None) -> str:
return str(crs) if crs else "EPSG:4326"
def _geometry_column(metadata: dict, column_names: list[str]) -> str:
geometry_name = metadata.get("geometry_name")
if geometry_name:
return str(geometry_name)
for name in ("wkb_geometry", "geometry", "geom"):
if name in column_names:
return name
return column_names[-1]
def _column_values(table, column: str, default: object = None) -> list[object]:
if column not in table.column_names:
return [default] * table.num_rows
return table[column].combine_chunks().to_pylist()
def _is_planning_conservation_area_record(dataset: object) -> bool:
    """Report whether a record's `dataset` value marks it as a conservation area.

    A missing dataset value is accepted; otherwise the value must equal the
    conservation-area dataset name, ignoring case and surrounding whitespace.
    """
    if dataset is None:
        return True
    normalised = str(dataset).strip().casefold()
    return normalised == PLANNING_DATA_CONSERVATION_AREA_DATASET
def _is_current_planning_record(end_date: object) -> bool:
"""A planning record is current when it has no end-date OR its end-date is
still in the future. The planning.data.gov.uk `end-date` field marks when a
designation is RETIRED, so a future date (e.g. 2029-12-31) is a still-current
area and must NOT be dropped — the previous "any non-empty date = ended"
logic wrongly excluded those (e.g. 22 current Gateshead conservation areas)."""
if end_date is None:
return True
if isinstance(end_date, str):
text = end_date.strip()
if text == "":
return True
try:
return date.fromisoformat(text[:10]) > date.today()
except ValueError:
# Unparseable end-date: keep the record rather than silently drop it.
return True
return False
def _load_conservation_area_geometries(
    conservation_areas_path: Path,
) -> tuple[list[BaseGeometry], str]:
    """Load the usable conservation-area polygons and their CRS.

    A record is kept only when it belongs to the conservation-area dataset, is
    still current, and has a non-empty Polygon/MultiPolygon geometry. A summary
    of skipped records is printed when anything was skipped.

    Returns:
        The kept geometries and the source CRS as a string.

    Raises:
        ValueError: if no record survives the filters.
    """
    metadata, table = pyogrio.read_arrow(conservation_areas_path)
    geometry_name = _geometry_column(metadata, table.column_names)
    datasets = _column_values(table, "dataset")
    end_dates = _column_values(table, "end-date")
    shapes = from_wkb(table[geometry_name].combine_chunks().to_pylist())

    usable: list[BaseGeometry] = []
    # Insertion order matters: it is the order used in the summary message.
    skipped = {"other_dataset": 0, "ended": 0, "empty_geometry": 0, "non_polygon": 0}
    for dataset, end_date, shape in zip(datasets, end_dates, shapes, strict=True):
        if not _is_planning_conservation_area_record(dataset):
            skipped["other_dataset"] += 1
        elif not _is_current_planning_record(end_date):
            skipped["ended"] += 1
        elif shape is None or shape.is_empty:
            skipped["empty_geometry"] += 1
        elif shape.geom_type not in {"Polygon", "MultiPolygon"}:
            skipped["non_polygon"] += 1
        else:
            usable.append(shape)

    if not usable:
        raise ValueError(
            f"{conservation_areas_path} does not contain any usable polygon geometries"
        )
    if any(skipped.values()):
        summary = ", ".join(f"{reason}={count}" for reason, count in skipped.items())
        print("Skipped conservation-area records during load: " + summary)
    return usable, _normalise_crs(metadata.get("crs"))
def _postcode_conservation_area_flags(
    postcodes: pl.DataFrame,
    conservation_geometries: list[BaseGeometry],
    conservation_crs: object | None,
    batch_size: int = 100_000,
) -> pl.DataFrame:
    """Flag each postcode "Yes"/"No" for lying within a conservation area.

    Postcode lat/lon points are projected into the conservation CRS and tested
    for intersection against the polygons in batches of ``batch_size``.
    Postcodes without usable coordinates are flagged "No".

    Raises:
        ValueError: if ``postcodes`` lacks postcode, lat or lon.
    """
    absent = sorted({"postcode", "lat", "lon"} - set(postcodes.columns))
    if absent:
        raise ValueError(f"Postcode data missing required columns: {absent}")

    every_postcode = postcodes.select("postcode").drop_nulls().unique()
    all_no = pl.lit("No").alias(CONSERVATION_AREA_FEATURE)

    located = postcodes.select("postcode", "lat", "lon").drop_nulls()
    if located.is_empty():
        return every_postcode.with_columns(all_no)

    finite_mask = np.isfinite(located["lat"].to_numpy()) & np.isfinite(
        located["lon"].to_numpy()
    )
    located = located.filter(pl.Series(finite_mask))
    if located.is_empty():
        return every_postcode.with_columns(all_no)

    # always_xy=True means the transformer takes (lon, lat) order.
    to_conservation_crs = Transformer.from_crs(
        "EPSG:4326", _normalise_crs(conservation_crs), always_xy=True
    )
    xs, ys = to_conservation_crs.transform(
        located["lon"].to_numpy(), located["lat"].to_numpy()
    )

    index = STRtree(conservation_geometries)
    point_count = located.height
    inside = np.zeros(point_count, dtype=bool)
    for offset in range(0, point_count, batch_size):
        batch = points(xs[offset : offset + batch_size], ys[offset : offset + batch_size])
        hits = index.query(batch, predicate="intersects")
        if hits.size > 0:
            # Row 0 holds the batch-local indices of the points that intersect.
            inside[offset + hits[0]] = True

    per_postcode = (
        located.select("postcode")
        .with_columns(pl.Series("_within_conservation_area", inside))
        .group_by("postcode")
        .agg(pl.col("_within_conservation_area").max())
    )
    yes_no = (
        pl.when(pl.col("_within_conservation_area"))
        .then(pl.lit("Yes"))
        .otherwise(pl.lit("No"))
        .alias(CONSERVATION_AREA_FEATURE)
    )
    matched = per_postcode.with_columns(yes_no).select(
        "postcode", CONSERVATION_AREA_FEATURE
    )

    # Postcodes dropped for missing coordinates rejoin here and default to "No".
    combined = every_postcode.join(matched, on="postcode", how="left")
    return combined.with_columns(
        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
    ).select("postcode", CONSERVATION_AREA_FEATURE)
def _conservation_area_by_postcode(
    postcodes: pl.LazyFrame,
    conservation_areas_path: Path,
) -> pl.LazyFrame:
    """Load conservation polygons and return lazy per-postcode membership flags."""
    print(f"Loading conservation area polygons from {conservation_areas_path}...")
    polygons, polygon_crs = _load_conservation_area_geometries(conservation_areas_path)

    coordinates = postcodes.select("postcode", "lat", "lon")
    postcode_points = coordinates.collect(engine="streaming")
    print(
        f"Computing conservation area membership for {postcode_points.height} "
        "active English postcodes..."
    )

    flags = _postcode_conservation_area_flags(postcode_points, polygons, polygon_crs)
    return flags.lazy()
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Convert an IoD deprivation score to a 0-100 less-deprived percentile.

    The lowest score maps to 100 and the highest to 0; values in between are
    scaled from their descending average rank and rounded to one decimal.
    Nulls stay null. The result overwrites ``column``.
    """
    score = pl.col(column)
    scored_rows = score.count()
    zero_based_rank = score.rank("average", descending=True) - 1
    scaled = (zero_based_rank / (scored_rows - 1) * 100).round(1)

    percentile = (
        pl.when(score.is_null())
        .then(None)
        # Pin the extremes explicitly so ties at either end get exactly 100 / 0.
        .when(score == score.min())
        .then(100.0)
        .when(score == score.max())
        .then(0.0)
        .when(scored_rows > 1)
        .then(scaled)
        .otherwise(100.0)
    )
    return percentile.alias(column)
def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame:
    """Scan the tree-density parquet into a (postcode, density percentile) frame.

    Uses the canonical density column when present, otherwise the single
    radius-suffixed percentile column. The value is cast to Float32 and
    exposed under the canonical name, with one row per non-null postcode.

    Raises:
        ValueError: if `postcode` is missing or the density column cannot be
            identified unambiguously.
    """
    scanned = pl.scan_parquet(tree_density_postcodes_path)
    available = set(scanned.collect_schema().names())
    if "postcode" not in available:
        raise ValueError(
            f"{tree_density_postcodes_path} is missing required column: postcode"
        )

    density_column = TREE_DENSITY_FEATURE
    if density_column not in available:
        candidates = sorted(
            name
            for name in available
            if _POSTCODE_TREE_DENSITY_PERCENTILE_RE.match(name)
        )
        if len(candidates) != 1:
            raise ValueError(
                f'{tree_density_postcodes_path} must contain column "{TREE_DENSITY_FEATURE}" '
                'or exactly one "Tree canopy density percentile within {radius}m" column; '
                f"found {len(candidates)} postcode percentile columns"
            )
        (density_column,) = candidates

    density = pl.col(density_column).cast(pl.Float32).alias(TREE_DENSITY_FEATURE)
    selected = scanned.select(pl.col("postcode"), density)
    return selected.drop_nulls(["postcode"]).unique(["postcode"])
def _validate_lsoa_source_coverage(iod_path: Path, ethnicity_path: Path) -> None:
    """Fail if the LSOA-keyed ethnicity table misses any IoD LSOA.

    Ethnicity comes from Census 2021 TS021 at LSOA level and is joined on
    `lsoa21`, like median age and IoD. The IoD table defines the LSOA universe
    every postcode resolves into, so a missing LSOA would silently null the
    ethnicity columns for those postcodes; full coverage is required instead.

    Raises:
        ValueError: listing up to ten of the uncovered LSOAs.
    """
    iod_codes = pl.read_parquet(iod_path, columns=["LSOA code (2021)"])
    iod_lsoas = iod_codes.rename({"LSOA code (2021)": "lsoa21"})
    ethnicity_lsoas = pl.read_parquet(ethnicity_path, columns=["lsoa21"])

    uncovered = iod_lsoas.join(ethnicity_lsoas, on="lsoa21", how="anti")
    missing_ethnicity = uncovered.sort("lsoa21")
    if missing_ethnicity.is_empty():
        return
    raise ValueError(
        "Ethnicity data is missing LSOA coverage: "
        f"{missing_ethnicity.height} LSOAs, e.g. "
        f"{missing_ethnicity.head(10).to_dicts()}"
    )
def _validate_lad_source_coverage(
    iod_path: Path, rental_prices_path: Path
) -> None:
    """Fail if the rental table misses a LAD that is not a known source gap.

    Every 2024 LAD present in the IoD table must appear in the rental table,
    except the authorities in `_RENT_SOURCE_UNAVAILABLE_LADS`; those are only
    reported.

    Raises:
        ValueError: listing the unexpectedly missing LADs.
    """
    code_column = "Local Authority District code (2024)"
    name_column = "Local Authority District name (2024)"
    iod_lads = (
        pl.read_parquet(iod_path, columns=[code_column, name_column])
        .rename({code_column: "lad", name_column: "lad_name"})
        .unique(["lad"])
    )
    rental_codes = pl.read_parquet(rental_prices_path, columns=["area_code"])
    rental_lads = rental_codes.rename({"area_code": "lad"})

    missing_rent = iod_lads.join(rental_lads, on="lad", how="anti").sort("lad")
    tolerated_codes = list(_RENT_SOURCE_UNAVAILABLE_LADS)
    unexpected_missing_rent = missing_rent.filter(
        pl.col("lad").is_in(tolerated_codes).not_()
    )
    if not unexpected_missing_rent.is_empty():
        raise ValueError(
            "Rental data is missing 2024 LAD coverage: "
            f"{unexpected_missing_rent.to_dicts()}"
        )
    if not missing_rent.is_empty():
        print(
            "PIPR has no LAD-level rent estimates for source-unavailable LADs; "
            f"rent will remain null there: {missing_rent.to_dicts()}"
        )
def _validate_property_postcodes(df: pl.DataFrame) -> None:
    """Fail if any property row has a null or blank `Postcode`.

    Raises:
        ValueError: with the offending row count and a sample of up to ten rows.
    """
    postcode = pl.col("Postcode")
    is_blank = postcode.cast(pl.Utf8).str.strip_chars() == ""
    invalid = df.filter(postcode.is_null() | is_blank)
    if invalid.is_empty():
        return

    # Show only the identifying columns that actually exist in this frame.
    preferred = ("Postcode", "Address per Property Register", "Last known price")
    sample_cols = [name for name in preferred if name in invalid.columns]
    sample = invalid.select(sample_cols).head(10).to_dicts()
    raise ValueError(
        "Property rows missing a postcode after merge: "
        f"{invalid.height} rows. Sample: {sample}"
    )
def _active_english_postcode_area(arcgis_raw: pl.LazyFrame) -> pl.LazyFrame:
    """Return the supported postcode universe with geography join keys.

    Keeps rows whose country code is E92000001 and whose `doterm` is null,
    renames the source columns to the pipeline's join-key names, and leaves
    one row per non-null postcode.
    """
    english = arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")
    active = english.filter(pl.col("doterm").is_null())
    keyed = active.select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lon"),
        "ctry25cd",
        pl.col("lsoa21cd").alias("lsoa21"),
        pl.col("oa21cd").alias("oa21"),
        pl.col("pcon24cd").alias("pcon"),
    )
    return keyed.drop_nulls(["postcode"]).unique(["postcode"])
def _remap_terminated_postcodes(
    wide: pl.LazyFrame, postcode_mapping: pl.LazyFrame
) -> pl.LazyFrame:
    """Replace each mapped old postcode with its `new_postcode`.

    Rows whose postcode has no entry in ``postcode_mapping`` keep their
    original postcode.
    """
    with_successor = wide.join(
        postcode_mapping,
        left_on="postcode",
        right_on="old_postcode",
        how="left",
    )
    # Prefer the mapped value; fall back to the original when unmapped.
    remapped = with_successor.with_columns(
        pl.coalesce("new_postcode", "postcode").alias("postcode"),
    )
    return remapped.drop("new_postcode")
def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
    """Keep one row per (postcode, pp_address) — the most recent transaction.

    The terminated-postcode remap can send two distinct postcodes to one active
    successor, which collapses the same physical address onto a single
    (postcode, pp_address) key with conflicting sale records. The row with the
    latest date_of_transfer wins, so the headline price and date reflect the
    most recent transaction; genuinely distinct addresses (a different
    pp_address) are untouched. pp_address is non-null here (join_epc_pp filters
    it), so the key never merges unrelated rows.
    """
    # Newest first, null dates last, so "first" per key is the latest sale.
    newest_first = wide.sort("date_of_transfer", descending=True, nulls_last=True)
    return newest_first.unique(
        subset=["postcode", "pp_address"],
        keep="first",
        maintain_order=True,
    )
def _filter_to_active_english_postcodes(
    wide: pl.LazyFrame, active_postcodes: pl.LazyFrame
) -> pl.LazyFrame:
    """Keep only the rows whose postcode appears in ``active_postcodes``."""
    # A semi join filters without adding columns or duplicating rows.
    kept = wide.join(active_postcodes, on="postcode", how="semi")
    return kept
def _join_area_side_tables(
    base: pl.LazyFrame,
    *,
    iod: pl.LazyFrame,
    ethnicity: pl.LazyFrame,
    crime: pl.LazyFrame,
    median_age: pl.LazyFrame,
    election: pl.LazyFrame,
    poi_counts: pl.LazyFrame,
    noise: pl.LazyFrame,
    school_catchments: pl.LazyFrame,
    conservation_areas: pl.LazyFrame,
    tree_density: pl.LazyFrame | None,
    broadband: pl.LazyFrame,
) -> pl.LazyFrame:
    """Left-join every area-level side table onto the postcode base frame.

    All joins are left joins, so a postcode that is absent from a side table
    keeps nulls for that table's columns (conservation-area membership
    defaults to "No"). ``tree_density`` is optional and skipped when ``None``.
    """
    joined = base.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")

    # Ethnicity is Census 2021 TS021 at LSOA (~33,755 areas), joined on the same
    # `lsoa21` key as median age and IoD — a ~100x granularity gain over the old
    # Local-Authority broadcast, with no change to the 6-bucket output schema.
    joined = joined.join(ethnicity, on="lsoa21", how="left")

    # Crime is counted spatially per postcode (incidents within 50m of the
    # postcode boundary), so it joins on postcode rather than LSOA. crime_spatial
    # precomputes the Serious/Minor headline rollups as the mean of the by-year
    # rollup bars; those are read straight through (renamed to the internal
    # columns _finalize_merged_columns expects) rather than re-summing the
    # per-type avg/yr columns — summing divides each type by its OWN
    # years-present and overstates the rollup when types differ in coverage. A
    # postcode absent from the crime table keeps null rollups via the left join
    # (no fabricated zero); the per-type avg/yr columns pass through unchanged
    # for display.
    crime_rollup_names = {
        "Serious crime (avg/yr)": "serious_crime_avg_yr",
        "Minor crime (avg/yr)": "minor_crime_avg_yr",
    }
    joined = joined.join(crime, on="postcode", how="left").rename(crime_rollup_names)

    # The remaining plain single-key joins, in their original order.
    plain_joins = (
        (median_age, "lsoa21"),
        (election, "pcon"),
        (poi_counts, "postcode"),
        (noise, "postcode"),
        (school_catchments, "postcode"),
        (conservation_areas, "postcode"),
    )
    for side_table, key in plain_joins:
        joined = joined.join(side_table, on=key, how="left")
    joined = joined.with_columns(pl.col(CONSERVATION_AREA_FEATURE).fill_null("No"))

    if tree_density is not None:
        joined = joined.join(tree_density, on="postcode", how="left")

    # Broadband is the one side table sourced straight from a third-party CSV
    # (Ofcom `postcode_space`) rather than from a sibling pcds-keyed pipeline
    # step, so its postcode may drift in spacing/casing from the NSPL `pcds`
    # base key. BOTH sides are normalised to the same canonical pcds form
    # (reusing `_canonical_postcode_expr`, exactly as the listing/EPC re-anchor
    # joins do) before joining; otherwise a real postcode silently misses and
    # its `max_download_speed` reads as null "no data" downstream. The table is
    # re-aggregated on the canonical key so two raw spellings collapsing to one
    # key cannot fan out the base, and null canonical keys are dropped so an
    # unparseable Ofcom row joins nothing rather than matching a null-key base
    # row.
    bb_key = _canonical_postcode_expr("bb_postcode").alias("_bb_canonical_postcode")
    broadband_canonical = (
        broadband.with_columns(bb_key)
        .drop_nulls("_bb_canonical_postcode")
        .group_by("_bb_canonical_postcode")
        .agg(pl.col("max_download_speed").max())
    )

    base_key = _canonical_postcode_expr("postcode").alias("_base_canonical_postcode")
    with_broadband = joined.with_columns(base_key).join(
        broadband_canonical,
        left_on="_base_canonical_postcode",
        right_on="_bb_canonical_postcode",
        how="left",
    )
    return with_broadband.drop("_base_canonical_postcode")
def _finalize_merged_columns(frame: pl.LazyFrame) -> pl.LazyFrame:
    """Drop the internal-only columns, then rename the rest to output names.

    Both steps are non-strict, so names that are absent from ``frame`` are
    ignored.
    """
    trimmed = frame.drop(_FINAL_DROP_COLUMNS, strict=False)
    return trimmed.rename(_FINAL_RENAME_COLUMNS, strict=False)
def _area_columns_from(columns: list[str]) -> list[str]:
    """Return, in input order, the columns that belong to the postcode output."""
    selected: list[str] = []
    for name in columns:
        if name in _AREA_COLUMNS or _is_dynamic_poi_metric_column(name):
            selected.append(name)
    return selected
def _property_columns_from(columns: list[str]) -> list[str]:
    """Return, in input order, the columns that belong to the property output.

    That is every column that is not postcode-level, plus "Postcode" itself.
    """

    def _is_property_column(name: str) -> bool:
        # "Postcode" is the shared key and appears in both outputs.
        if name == "Postcode":
            return True
        return not (name in _AREA_COLUMNS or _is_dynamic_poi_metric_column(name))

    return [name for name in columns if _is_property_column(name)]
def _validate_postcode_feature_output(
    postcode_df: pl.DataFrame, expected_postcode_count: int
) -> None:
    """Check the postcode output against the active England postcode universe.

    Requires the identifying columns, exactly ``expected_postcode_count``
    rows with as many distinct postcodes, and no row with a missing postcode,
    missing coordinates, or a country code other than E92000001.

    Raises:
        ValueError: describing the first check that fails.
    """
    absent = sorted({"Postcode", "lat", "lon", "ctry25cd"} - set(postcode_df.columns))
    if absent:
        raise ValueError(f"Postcode feature output missing columns: {absent}")

    unique_count = postcode_df["Postcode"].n_unique()
    counts_agree = (
        postcode_df.height == expected_postcode_count
        and unique_count == expected_postcode_count
    )
    if not counts_agree:
        raise ValueError(
            "Postcode feature output no longer matches the active England "
            "postcode universe: "
            f"rows={postcode_df.height:,}, unique={unique_count:,}, "
            f"expected={expected_postcode_count:,}"
        )

    postcode = pl.col("Postcode")
    country = pl.col("ctry25cd")
    bad_postcode = postcode.is_null() | (postcode.cast(pl.Utf8).str.strip_chars() == "")
    bad_location = pl.col("lat").is_null() | pl.col("lon").is_null()
    bad_country = country.is_null() | (country != "E92000001")
    invalid = postcode_df.filter(bad_postcode | bad_location | bad_country)
    if invalid.is_empty():
        return

    sample = invalid.select("Postcode", "ctry25cd", "lat", "lon").head(10).to_dicts()
    raise ValueError(
        "Postcode feature output contains unsupported or ungeocoded rows: "
        f"{invalid.height} rows. Sample: {sample}"
    )
def _split_normal_outputs(
    df: pl.DataFrame,
    postcode_features: pl.DataFrame,
    *,
    expected_postcode_count: int,
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Split the merged data into ``(postcode_df, properties_df)``.

    ``postcode_df`` is the area-level column subset of ``postcode_features``
    and is validated against ``expected_postcode_count`` (raising ValueError
    on failure) before the property columns of ``df`` are selected.
    """
    area_columns = _area_columns_from(postcode_features.columns)
    postcode_df = postcode_features.select(area_columns)
    _validate_postcode_feature_output(postcode_df, expected_postcode_count)

    property_columns = _property_columns_from(df.columns)
    return postcode_df, df.select(property_columns)
# Map listings-parquet source columns to the `_actual_*` overlay columns
# carried alongside the wide frame through the postcode-keyed joins. After the
# rest of the pipeline finalises, listing rows pick their canonical dashboard
# values from these overlays in `_finalize_listings`.
#
# Each entry is (source column, overlay column, dtype). At load time
# (`_load_listings_for_merge`) the source column is cast non-strictly to the
# dtype, and Float32/Float64 overlays additionally have NaN replaced by null.
_LISTING_OVERLAY_SOURCES: tuple[tuple[str, str, pl.DataType], ...] = (
    ("Listing URL", "_actual_listing_url", pl.Utf8),
    ("Asking price", "_actual_asking_price", pl.Int64),
    ("Asking price per sqm", "_actual_asking_price_per_sqm", pl.Int32),
    ("Listing date", "_actual_listing_date", pl.Datetime("us")),
    ("Listing status", "_actual_listing_status", pl.Utf8),
    ("Listing features", "_actual_listing_features", pl.List(pl.Utf8)),
    ("Bedrooms", "_actual_bedrooms", pl.Int32),
    ("Bathrooms", "_actual_bathrooms", pl.Int32),
    ("Price qualifier", "_actual_price_qualifier", pl.Utf8),
    ("Property sub-type", "_actual_property_sub_type", pl.Utf8),
    ("lat", "_actual_lat", pl.Float64),
    ("lon", "_actual_lon", pl.Float64),
    # Seeds for the wide row that an unmatched listing produces.
    ("Total floor area (sqm)", "_actual_total_floor_area", pl.Float64),
    ("Number of bedrooms & living rooms", "_actual_number_habitable_rooms", pl.Int16),
    ("Property type", "_actual_property_type", pl.Utf8),
    ("Leasehold/Freehold", "_actual_leasehold_freehold", pl.Utf8),
)
# NOTE(review): presumably a non-null value in this overlay column marks a
# wide-frame row as a listing; confirm against the consumers of this name.
_LISTING_FLAG_COLUMN = "_actual_listing_url"
# NOTE(review): looks like the closed set of tenure labels; the consumer is
# outside this section, so confirm before relying on it.
_TENURE_VALUES = ["Freehold", "Leasehold"]
# Property-type labels. "Flats/Maisonettes", "Terraced" and "Other" are the
# targets of the mapping in `_canonical_epc_property_type_expr`.
_PROPERTY_TYPE_VALUES = [
    "Detached",
    "Semi-Detached",
    "Terraced",
    "Flats/Maisonettes",
    "Other",
]
# Valid EPC energy-rating bands; `_load_direct_epc_candidates` nulls any
# current/potential rating that is not in this list.
_EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
# Listings are matched to EPC certificates and Price-Paid properties first by
# UPRN (exact) and otherwise by fuzzy street-address similarity within the same
# postcode. A house number in the listing address is the strong disambiguator,
# so a numbered listing may match on a lower street-similarity score than a
# number-less one (which must match the street almost exactly to be trusted).
# Both thresholds are compared against the 0-100 fuzzy score in
# `_best_listing_match`; the number-less one also gates street identity in
# `_best_street_epc_fallback`.
_LISTING_MATCH_MIN_SCORE_WITH_NUMBERS = 82
_LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 90
# (column, dtype) for every column a direct listing->EPC match contributes.
# Used to build the match-frame schema (`_direct_epc_match_schema`) and to
# back-fill typed null columns when no match ran (`_ensure_direct_epc_columns`).
_DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
    ("_direct_epc_address", pl.Utf8),
    ("_direct_current_energy_rating", pl.Utf8),
    ("_direct_potential_energy_rating", pl.Utf8),
    ("_direct_total_floor_area", pl.Float64),
    ("_direct_number_habitable_rooms", pl.Int16),
    ("_direct_floor_height", pl.Float64),
    ("_direct_construction_age_band", pl.UInt16),
    ("_direct_is_construction_date_approximate", pl.UInt8),
    ("_direct_was_council_house", pl.Utf8),
    ("_direct_epc_match_status", pl.Utf8),
    ("_direct_epc_match_score", pl.Float32),
    ("_direct_epc_match_method", pl.Utf8),
)
# Raw wide-frame column -> the `_direct_*` column that backs it up.
# `_coalesce_direct_epc_columns` fills each raw column from its direct
# counterpart (raw value first, direct value as the fallback).
_DIRECT_EPC_RAW_COLUMN_MAP = {
    "epc_address": "_direct_epc_address",
    "current_energy_rating": "_direct_current_energy_rating",
    "potential_energy_rating": "_direct_potential_energy_rating",
    "total_floor_area": "_direct_total_floor_area",
    "number_habitable_rooms": "_direct_number_habitable_rooms",
    "floor_height": "_direct_floor_height",
    "construction_age_band": "_direct_construction_age_band",
    "is_construction_date_approximate": "_direct_is_construction_date_approximate",
    "was_council_house": "_direct_was_council_house",
}
def _canonical_postcode_expr(column: str) -> pl.Expr:
    """Re-format a postcode into NSPL `pcds` style (e.g. `AB1 2CD`) or null.

    The value is upper-cased and stripped of every non-alphanumeric
    character; if what remains looks like a full UK postcode, a single space
    is inserted before the final three characters. Anything else yields null.
    """
    upper = pl.col(column).cast(pl.Utf8).str.to_uppercase()
    compact = upper.str.replace_all(r"[^A-Z0-9]+", "").str.strip_chars()
    looks_valid = compact.str.contains(r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$")
    spaced = compact.str.replace(r"^(.+)([0-9][A-Z]{2})$", "${1} ${2}")
    return pl.when(looks_valid).then(spaced).otherwise(None)
def _postcode_outcode_expr(column: str) -> pl.Expr:
    """Expression for the outcode of `column`'s normalised postcode key.

    Yields null when the key does not have the full-postcode shape.
    """
    postcode_key = normalize_postcode_key(pl.col(column))
    return postcode_key.str.extract(r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$", 1)
_OUTCODE_RE = re.compile(r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$")
def _outcode_of(postcode: str | None) -> str | None:
"""Outcode of a compact normalised postcode ("BR15RW" -> "BR1")."""
if not postcode:
return None
match = _OUTCODE_RE.match(postcode)
return match.group(1) if match else None
def _canonical_epc_property_type_expr() -> pl.Expr:
    """Expression mapping EPC `epc_property_type`/`built_form` to a canonical type.

    For a "House" with a usable `built_form`, the built form is the starting
    value; otherwise the EPC property type is (null when there is none). The
    starting value is then passed through the replacement table below.
    """
    epc_type = pl.col("epc_property_type")
    built_form = pl.col("built_form")

    built_form_unusable = built_form.is_null() | built_form.is_in(
        ["NO DATA!", "Not Recorded"]
    )
    epc_known = epc_type.is_not_null()
    house_with_form = epc_known & (epc_type == "House") & ~built_form_unusable

    raw_type = (
        pl.when(house_with_form)
        .then(built_form)
        .when(epc_known)
        .then(epc_type)
        .otherwise(None)
    )
    canonical_names = {
        "Flat": "Flats/Maisonettes",
        "Maisonette": "Flats/Maisonettes",
        "End-Terrace": "Terraced",
        "Mid-Terrace": "Terraced",
        "Enclosed End-Terrace": "Terraced",
        "Enclosed Mid-Terrace": "Terraced",
        "Bungalow": "Other",
        "Park home": "Other",
        "House": "Other",
    }
    return raw_type.replace(canonical_names)
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    """Expression converting an EPC construction age band column to a year.

    Delegates to the shared `epc_band_to_year` mapping so the direct-EPC /
    listings path matches join_epc_pp (band midpoint, not lower bound;
    'before 1900' and implausible years -> null). Already-numeric inputs pass
    through unchanged.
    """
    age_band = pl.col(column)
    return epc_band_to_year(age_band)
def _address_score(query: str, candidate: str | None, *, allow_token_set: bool) -> int:
if not candidate:
return 0
# token_set_ratio returns 100 whenever the shorter token set is a subset of
# the longer. For a NUMBER-LESS query that is unsafe — a single locality
# token (e.g. "KINGSWOOD") subsets to 100 against any long address that
# merely contains it — so number-less queries score with token_sort_ratio
# only, matching the canonical fuzzy_join._score_bucket. For a NUMBERED
# query the unconditional fuzzy_join._numbers_compatible gate has already
# guaranteed the candidate carries identical house numbers, so token_set
# cannot inflate
# across different addresses; allowing it recovers genuine matches where the
# scraped listing appends trailing town/county tokens the bare register
# address omits (e.g. "105 RIDGEWAY DRIVE BROMLEY KENT" vs "105 RIDGEWAY
# DRIVE").
if allow_token_set:
return max(
fuzz.token_set_ratio(query, candidate),
fuzz.token_sort_ratio(query, candidate),
)
return fuzz.token_sort_ratio(query, candidate)
def _has_number(address: str | None) -> bool:
return bool(address and _NUMBER_RE.search(address))
def _enum_bonus(
left: str | None, right: str | None, *, exact: float, mismatch: float
) -> float:
if not left or not right:
return 0.0
return exact if left == right else mismatch
def _ratio_bonus(
left: float | int | None, right: float | int | None, pct: float, cap: float
) -> float:
if left is None or right is None:
return 0.0
try:
left_f = float(left)
right_f = float(right)
except (TypeError, ValueError):
return 0.0
if left_f <= 0 or right_f <= 0:
return 0.0
rel = abs(left_f - right_f) / max(left_f, right_f)
if rel > pct:
return 0.0
return cap * (1.0 - rel / pct)
def _rooms_bonus(left: int | None, right: int | None) -> float:
if left is None or right is None:
return 0.0
try:
diff = abs(int(left) - int(right))
except (TypeError, ValueError):
return 0.0
if diff == 0:
return 4.0
if diff == 1:
return 2.0
return 0.0
def _street_only_address(address: str) -> str:
    """Return the street/locality part of a normalised address.

    Digit-bearing tokens (house numbers, flat numbers, including letter
    suffixes like 8A) are removed via `drop_digit_tokens`.
    """
    street_part = drop_digit_tokens(address)
    return street_part
def _is_specific_street_query(query: str) -> bool:
"""Whether a number-less listing address is specific enough for the
street-level fallback. token_set_ratio scores 100 whenever the query's
tokens are a subset of the candidate's, so a one-token query (a bare named
house like "KINGSWOOD") would match any street containing that word;
require at least two substantive tokens ("OLDSTEAD ROAD ...") instead."""
substantive = [
token
for token in query.split()
if token not in _LISTED_NAME_STOP_WORDS and len(token) >= 3
]
return len(substantive) >= 2
def _normalize_uprn(value: object) -> str | None:
"""Canonical UPRN string (digits only) or None.
UPRNs arrive as strings or ints from the scraper / EPC register; normalise
so a listing UPRN and an EPC/property UPRN compare equal regardless of dtype
or stray whitespace. A float (e.g. a NaN-bearing column read as Float) is
rejected unless it is an exact integer, so "123.0"/"1.5e11" can never be
silently mangled into a bogus all-digits key.
"""
if value is None:
return None
if isinstance(value, float):
if not value.is_integer():
return None
value = int(value)
digits = re.sub(r"\D", "", str(value))
return digits or None
def _best_listing_match(
listing_uprn: str | None,
query: str | None,
uprn_index: dict[str, dict],
bucket_candidates: list[dict],
addressed_fields: list[str],
) -> tuple[dict, float, str, str | None] | None:
"""Pick the best candidate for a listing.
Matching is, in order: (1) an exact UPRN equality against the global
``uprn_index`` (postcode-independent, so it is robust even when the
listing's postcode is slightly off); (2) failing that, the highest
fuzzy street-address similarity within the listing's own postcode bucket.
No property-attribute heuristics are used — `fuzzy_join._numbers_compatible`
gates every fuzzy match unconditionally (so a number-less listing can never
match a numbered property, and vice versa), as in the canonical
`fuzzy_join._score_bucket`. A house number additionally lowers the score
threshold and (via `_address_score`) permits token_set scoring; a number-less
address scores on token_sort only and must match the street almost exactly.
The direct-EPC path layers a street-level fallback on top of this strict
matcher — see `_best_street_epc_fallback`.
``addressed_fields`` names the candidate columns to fuzzy-match against (a
candidate may carry both a register and an EPC address). Returns
``(candidate, score, method, matched_field)`` or None. ``method`` is
"uprn" or "address"; ``matched_field`` is the winning address column (or
None for a UPRN match).
"""
if listing_uprn:
hit = uprn_index.get(listing_uprn)
if hit is not None:
return hit, 100.0, "uprn", None
if not query:
return None
listing_has_numbers = _has_number(query)
best: dict | None = None
best_score = 0
best_field: str | None = None
for candidate in bucket_candidates:
for field in addressed_fields:
address = candidate.get(field)
if not address:
continue
# Unconditional number gate (the canonical fuzzy_join one: set
# equality over suffix-aware tokens): a number-less listing cannot
# match a numbered candidate, 8A cannot match 8B, and a flat
# cannot absorb its whole building.
if not _equal_numbers_compatible(query, address):
continue
score = _address_score(query, address, allow_token_set=listing_has_numbers)
if score > best_score:
best_score = score
best = candidate
best_field = field
if best is None:
return None
threshold = (
_LISTING_MATCH_MIN_SCORE_WITH_NUMBERS
if listing_has_numbers
else _LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS
)
if best_score < threshold:
return None
return best, float(best_score), "address", best_field
# Ranking bonuses for the street-level direct-EPC fallback. A certificate in
# the listing's own postcode unit is the nearest segment of the street, and a
# certificate sharing a house-number token with the listing (e.g. listing
# "751 753 Cranbrook Road" vs certificate "751 Cranbrook Road", which fails the
# strict set-equality gate) is almost certainly the right property — both
# should beat a bare attribute-agreement win.
# Both are added to a candidate's ranking total in `_best_street_epc_fallback`.
# Added when the candidate's normalised postcode equals the listing's.
_STREET_FALLBACK_SAME_POSTCODE_BONUS = 3.0
# Added when the listing and the candidate address share a number token.
_STREET_FALLBACK_NUMBER_OVERLAP_BONUS = 8.0
def _best_street_epc_fallback(
listing: dict,
outcode_streets: dict[str, list[dict]] | None,
outcode_noise_tokens: set[str],
street_score_cache: dict[tuple[str, str], list[tuple[int, str]]],
) -> tuple[dict, float, str, None] | None:
"""Street-level direct-EPC fallback for listings the strict matcher missed.
~90% of scraped listings publish a street-level address only ("Oldstead
Road, Bromley" — Rightmove never exposes the house number or UPRN), so the
strict matcher in `_best_listing_match` can never match them against the
virtually-always-numbered EPC register and their EPC-derived fields
(energy rating, interior height, former-council-house flag, construction
year) would all be null. Such a listing is instead matched to the best EPC
certificate on the SAME STREET in its own OUTCODE: long streets span
several postcode units, so postcode-only buckets missed ~43% of otherwise
matchable listings (funnel-measured on 2026-06 data). Street identity is
token_set_ratio between the digit-stripped halves of both addresses (every
same-street certificate scores ~100); qualifying certificates are ranked
by attribute agreement (property type, floor area, habitable rooms) plus
a same-postcode-unit preference and a house-number-overlap bonus (a
numbered listing that failed the strict set-equality gate, e.g. a
"751 753" range vs "751", still lands on the right property). The result
is street-representative rather than property-exact — hence the distinct
"street" method label so downstream consumers can tell the two confidence
levels apart. Applied to the direct-EPC join only; the property-register
(sale history) join stays strict because a price is property-exact in a
way an energy band is not.
``street_score_cache`` memoises the per-(outcode, query-street) fuzzy scan
over the outcode's unique street keys: listings on the same street share
the scan, which keeps the full-register run to seconds.
"""
query = listing.get("_listing_match_address")
if not query or not outcode_streets:
return None
query_street = _street_only_address(query)
if not query_street or not _is_specific_street_query(query_street):
return None
outcode = (
listing.get("_listing_outcode")
or _outcode_of(listing.get("_listing_match_postcode"))
or ""
)
cache_key = (outcode, query_street)
qualifying = street_score_cache.get(cache_key)
if qualifying is None:
# A qualifying street must be anchored by a shared token that is NOT a
# locality suffix of this outcode (see _index_epc_streets), so a
# town-only address can't subset-inflate onto an arbitrary street.
query_tokens = set(query_street.split())
qualifying = [
(score, street)
for street in outcode_streets
if (query_tokens & set(street.split())) - outcode_noise_tokens
and (score := fuzz.token_set_ratio(query_street, street))
>= _LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS
]
street_score_cache[cache_key] = qualifying
listing_postcode = listing.get("_listing_match_postcode")
listing_numbers = set(_SUFFIXED_NUMBER_RE.findall(query))
best: dict | None = None
best_total = float("-inf")
best_street_score = 0
for street_score, street in qualifying:
for candidate in outcode_streets[street]:
total = float(street_score)
total += _enum_bonus(
listing.get("_actual_property_type"),
candidate.get("_direct_epc_canonical_property_type"),
exact=6.0,
mismatch=-6.0,
)
total += _ratio_bonus(
listing.get("_actual_total_floor_area"),
candidate.get("_direct_total_floor_area"),
pct=0.12,
cap=8.0,
)
total += _rooms_bonus(
listing.get("_actual_number_habitable_rooms"),
candidate.get("_direct_number_habitable_rooms"),
)
if (
listing_postcode
and candidate.get("_direct_epc_match_postcode") == listing_postcode
):
total += _STREET_FALLBACK_SAME_POSTCODE_BONUS
if listing_numbers and listing_numbers & set(
_SUFFIXED_NUMBER_RE.findall(
candidate.get("_direct_epc_match_address") or ""
)
):
total += _STREET_FALLBACK_NUMBER_OVERLAP_BONUS
if total > best_total:
best_total = total
best = candidate
best_street_score = street_score
if best is None:
return None
return best, float(best_street_score), "street", None
def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
    """Read the listings parquet and prepare it for the wide-frame merge.

    The returned frame is keyed by `_listing_idx` and carries:

    * `postcode` — canonical (NSPL `pcds`) form, with terminated postcodes
      remapped to their nearest active successor; falls back to the raw
      value when it cannot be canonicalised;
    * `pp_address` — the listing's raw register address (used as the
      address half of the fuzzy match);
    * `_listing_uprn` — digits-only UPRN, or null;
    * one `_actual_*` overlay column per `_LISTING_OVERLAY_SOURCES` entry.
    """
    raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx")
    postcode_mapping = build_postcode_mapping(arcgis_path).lazy()
    source_schema = raw.collect_schema()

    # UPRN is only present on scraped listings that carry it (Zoopla detail
    # pages); tolerate its absence so older parquets and test fixtures still
    # load. Digits-only so it compares equal to the EPC register's UPRN.
    if "UPRN" not in source_schema.names():
        listing_uprn_expr = pl.lit(None, dtype=pl.Utf8).alias("_listing_uprn")
    else:
        # Mirror `_normalize_uprn` exactly so the listing key compares equal to
        # the candidate-side key for every dtype.
        uprn_col = pl.col("UPRN")
        if source_schema["UPRN"].is_float():
            # A Float UPRN must be stringified via its integer form
            # (100023336956.0 -> "100023336956"); stripping non-digits from
            # "100023336956.0" would give a bogus trailing-zero key
            # ("1000233369560") that never collides. A non-integral float
            # (e.g. 1.5) is rejected rather than mangled.
            integral = uprn_col.cast(pl.Int64, strict=False)
            integral_text = integral.cast(pl.Utf8).str.replace_all(r"\D", "")
            uprn_digits = (
                pl.when(integral == uprn_col).then(integral_text).otherwise(None)
            )
        else:
            uprn_digits = uprn_col.cast(pl.Utf8).str.replace_all(r"\D", "")
        has_digits = uprn_digits.str.len_chars() > 0
        listing_uprn_expr = (
            pl.when(has_digits).then(uprn_digits).otherwise(None).alias("_listing_uprn")
        )

    # Listings parquets occasionally carry Float NaNs (e.g. floor area). Polars
    # treats NaN as distinct from null and the downstream `latest_price /
    # total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats
    # to null at load time.
    overlay_exprs: list[pl.Expr] = []
    overlay_names: list[str] = []
    for src, dst, dtype in _LISTING_OVERLAY_SOURCES:
        casted = pl.col(src).cast(dtype, strict=False)
        if dtype in (pl.Float32, pl.Float64):
            casted = casted.fill_nan(None)
        overlay_exprs.append(casted.alias(dst))
        overlay_names.append(dst)

    canonicalised = raw.with_columns(
        _canonical_postcode_expr("Postcode").alias("_canonical_postcode"),
    )
    remapped = canonicalised.join(
        postcode_mapping,
        left_on="_canonical_postcode",
        right_on="old_postcode",
        how="left",
    )
    # Preference order: remapped successor, canonical form, raw value.
    resolved_postcode = pl.coalesce(
        "new_postcode", "_canonical_postcode", "Postcode"
    ).alias("postcode")
    prepared = remapped.with_columns(
        resolved_postcode,
        pl.col("Address per Property Register").alias("pp_address"),
        listing_uprn_expr,
        *overlay_exprs,
    ).select(
        "_listing_idx",
        "postcode",
        "pp_address",
        "_listing_uprn",
        *overlay_names,
    )
    return prepared.collect(engine="streaming")
def _ensure_direct_epc_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Guarantee every `_DIRECT_EPC_COLUMNS` column exists on `df`.

    Missing columns are added as typed nulls; existing ones are left
    untouched. Returns `df` itself when nothing is missing.
    """
    present = df.columns
    fillers = [
        pl.lit(None, dtype=dtype).alias(name)
        for name, dtype in _DIRECT_EPC_COLUMNS
        if name not in present
    ]
    return df.with_columns(fillers) if fillers else df
def _direct_epc_match_schema() -> dict[str, pl.DataType]:
    """Schema of the direct-EPC match frame: `_listing_idx` + `_DIRECT_EPC_COLUMNS`."""
    schema = {"_listing_idx": pl.UInt32}
    schema.update(_DIRECT_EPC_COLUMNS)
    return schema
def _empty_direct_epc_matches() -> pl.DataFrame:
    """Zero-row frame carrying the direct-EPC match schema."""
    schema = _direct_epc_match_schema()
    return pl.DataFrame(schema=schema)
def _load_direct_epc_candidates(
    epc_path: Path,
    listing_outcodes: list[str],
    temp_dir: Path,
) -> pl.DataFrame:
    """Load one EPC candidate row per (normalised address, postcode).

    Certificates come from `_scan_epc_certificates`, are keyed by their
    normalised address and postcode, and are restricted to the outcodes in
    ``listing_outcodes``. Each key keeps the first row after sorting by
    ``inspection_date`` descending (nulls last). `_direct_was_council_house`
    is "Yes" when any certificate for the key has a tenure containing
    "social", otherwise "No". Ratings outside `_EPC_RATING_VALUES` become
    null.

    Returns an empty, typed frame when ``listing_outcodes`` is empty.
    """
    address_key = "_direct_epc_match_address"
    postcode_key = "_direct_epc_match_postcode"
    key_columns = [address_key, postcode_key]

    if not listing_outcodes:
        empty_schema = {
            "_direct_epc_row": pl.UInt32,
            address_key: pl.Utf8,
            postcode_key: pl.Utf8,
            "_direct_epc_outcode": pl.Utf8,
            "_direct_epc_canonical_property_type": pl.Utf8,
            "_direct_epc_uprn": pl.Utf8,
        }
        empty_schema.update(
            (column, dtype)
            for column, dtype in _DIRECT_EPC_COLUMNS
            if column.startswith("_direct_")
        )
        return pl.DataFrame(schema=empty_schema)

    def _valid_rating(source: str, target: str) -> pl.Expr:
        # Keep only ratings in the known A-G set; anything else becomes null.
        rating = pl.col(source)
        return (
            pl.when(rating.is_in(_EPC_RATING_VALUES))
            .then(rating)
            .otherwise(None)
            .alias(target)
        )

    keyed = _scan_epc_certificates(epc_path, temp_dir).with_columns(
        normalize_address_key(pl.col("epc_address")).alias(address_key),
        normalize_postcode_key(pl.col("epc_postcode")).alias(postcode_key),
    )
    outcode = pl.col(postcode_key).str.extract(
        r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$", 1
    )
    epc_base = (
        keyed.with_columns(outcode.alias("_direct_epc_outcode"))
        .filter(pl.col("_direct_epc_outcode").is_in(listing_outcodes))
        .filter(pl.col(address_key).is_not_null())
        .filter(pl.col(postcode_key).is_not_null())
    )

    # Keys with at least one certificate whose tenure mentions "social".
    is_social = pl.col("tenure").str.to_lowercase().str.contains("social")
    social_tenure = (
        epc_base.filter(is_social)
        .select(key_columns)
        .unique()
        .with_columns(pl.lit("Yes").alias("_direct_was_council_house"))
    )

    newest_first = epc_base.sort("inspection_date", descending=True, nulls_last=True)
    per_key = (
        newest_first.group_by(key_columns)
        .first()
        .join(social_tenure, on=key_columns, how="left")
    )

    derived = per_key.with_columns(
        _canonical_epc_property_type_expr().alias(
            "_direct_epc_canonical_property_type"
        ),
        _construction_year_expr().alias("_direct_construction_age_band"),
        _valid_rating("current_energy_rating", "_direct_current_energy_rating"),
        _valid_rating("potential_energy_rating", "_direct_potential_energy_rating"),
        pl.col("epc_address").alias("_direct_epc_address"),
        pl.col("uprn").alias("_direct_epc_uprn"),
        pl.col("total_floor_area").alias("_direct_total_floor_area"),
        pl.col("number_habitable_rooms").alias("_direct_number_habitable_rooms"),
        pl.col("floor_height").alias("_direct_floor_height"),
        pl.col("_direct_was_council_house").fill_null("No"),
    )
    # Flag is 1 whenever a construction year was derived, else null.
    approximate_flag = (
        pl.when(pl.col("_direct_construction_age_band").is_not_null())
        .then(pl.lit(1, dtype=pl.UInt8))
        .otherwise(pl.lit(None, dtype=pl.UInt8))
        .alias("_direct_is_construction_date_approximate")
    )
    output_columns = [
        "_direct_epc_row",
        address_key,
        postcode_key,
        "_direct_epc_outcode",
        "_direct_epc_canonical_property_type",
        "_direct_epc_uprn",
        "_direct_epc_address",
        "_direct_current_energy_rating",
        "_direct_potential_energy_rating",
        "_direct_total_floor_area",
        "_direct_number_habitable_rooms",
        "_direct_floor_height",
        "_direct_construction_age_band",
        "_direct_is_construction_date_approximate",
        "_direct_was_council_house",
    ]
    return (
        derived.with_columns(approximate_flag)
        .with_row_index("_direct_epc_row")
        .select(output_columns)
        .collect(engine="streaming")
    )
def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
    """Add the normalised address/postcode/outcode keys used to match listings.

    Adds `_listing_match_address`, `_listing_match_postcode` and
    `_listing_outcode` (null when the postcode key is not a full postcode).

    Listings are matched to EPC certificates and properties by UPRN and by
    fuzzy street address within their (now accurate, detail-page-sourced)
    postcode — never by coordinate proximity — so no projected easting/northing
    is computed here. `_listing_uprn` flows through from the loaded listings.
    """
    keyed = listings.with_columns(
        normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"),
        normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"),
    )
    outcode = pl.col("_listing_match_postcode").str.extract(
        r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$", 1
    )
    return keyed.with_columns(outcode.alias("_listing_outcode"))
def _optional_lazy_col(schema: pl.Schema, column: str, dtype: pl.DataType) -> pl.Expr:
    """Select `column` cast (non-strictly) to `dtype`, or a typed null if absent."""
    if column not in schema:
        return pl.lit(None, dtype=dtype).alias(column)
    return pl.col(column).cast(dtype, strict=False).alias(column)
def _listing_property_match_schema() -> dict[str, pl.DataType]:
    """Schema of the listing -> property match frame."""
    return dict(
        _listing_idx=pl.UInt32,
        _matched_postcode=pl.Utf8,
        _matched_pp_address=pl.Utf8,
        _property_match_score=pl.Float32,
        _property_match_method=pl.Utf8,
        _property_match_field=pl.Utf8,
    )
def _empty_listing_property_matches() -> pl.DataFrame:
    """Zero-row frame carrying the listing -> property match schema."""
    schema = _listing_property_match_schema()
    return pl.DataFrame(schema=schema)
def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
    """Collect the property rows a listing can be matched against.

    Keeps `postcode`, `pp_address`, and (when the wide frame has them)
    `epc_address` and `uprn`, adds a `_property_row` index plus the
    normalised postcode/address match keys, and drops rows without a register
    address, without a postcode key, or with neither address key.
    """
    schema = wide.collect_schema()
    projected = wide.select(
        pl.col("postcode").cast(pl.Utf8).alias("postcode"),
        pl.col("pp_address").cast(pl.Utf8).alias("pp_address"),
        _optional_lazy_col(schema, "epc_address", pl.Utf8),
        # UPRN keys the exact match; present once epc_pp is rebuilt with it.
        _optional_lazy_col(schema, "uprn", pl.Utf8),
    ).with_row_index("_property_row")

    keyed = projected.with_columns(
        normalize_postcode_key(pl.col("postcode")).alias("_property_match_postcode"),
        normalize_address_key(pl.col("pp_address")).alias("_property_match_address"),
        normalize_address_key(pl.col("epc_address")).alias(
            "_property_epc_match_address"
        ),
    )

    has_register_address = pl.col("pp_address").is_not_null()
    has_postcode_key = pl.col("_property_match_postcode").is_not_null()
    has_any_address_key = (
        pl.col("_property_match_address").is_not_null()
        | pl.col("_property_epc_match_address").is_not_null()
    )
    matchable = (
        keyed.filter(has_register_address)
        .filter(has_postcode_key)
        .filter(has_any_address_key)
    )
    return matchable.collect(engine="streaming")
def _index_candidates(
    candidates: pl.DataFrame, postcode_key: str, uprn_key: str
) -> tuple[dict[str, list[dict]], dict[str, dict]]:
    """Index candidate rows for matching, in a single pass over the frame.

    Returns ``(postcode_buckets, uprn_index)``:

    * ``postcode_buckets`` maps the ``postcode_key`` value to its rows and
      drives the fuzzy street-address match;
    * ``uprn_index`` maps the normalised ``uprn_key`` value to the FIRST row
      carrying it. It drives the exact match and is postcode-independent, so
      it still resolves when a listing's postcode is slightly off.
    """
    postcode_buckets: dict[str, list[dict]] = {}
    uprn_index: dict[str, dict] = {}
    for row in candidates.iter_rows(named=True):
        postcode = row.get(postcode_key)
        if postcode:
            bucket = postcode_buckets.get(postcode)
            if bucket is None:
                postcode_buckets[postcode] = [row]
            else:
                bucket.append(row)
        uprn = _normalize_uprn(row.get(uprn_key))
        if uprn:
            # setdefault keeps the first row seen for a duplicated UPRN.
            uprn_index.setdefault(uprn, row)
    return postcode_buckets, uprn_index
def _best_listing_property_candidate(
    listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
) -> dict | None:
    """Match one listing to a property and return the match record, or None.

    Delegates to `_best_listing_match` over both the register and the EPC
    address keys. `_property_match_field` reports the public column name of
    the winning address ("pp_address" / "epc_address"), or the match method
    itself ("uprn") when no address field was involved.
    """
    outcome = _best_listing_match(
        listing.get("_listing_uprn"),
        listing.get("_listing_match_address"),
        uprn_index,
        candidates,
        ["_property_match_address", "_property_epc_match_address"],
    )
    if outcome is None:
        return None

    candidate, score, method, field = outcome
    if field == "_property_match_address":
        matched_field = "pp_address"
    elif field == "_property_epc_match_address":
        matched_field = "epc_address"
    else:
        matched_field = method
    return {
        "_listing_idx": listing["_listing_idx"],
        "_matched_postcode": candidate.get("postcode"),
        "_matched_pp_address": candidate.get("pp_address"),
        "_property_match_score": round(score, 1),
        "_property_match_method": method,
        "_property_match_field": matched_field,
    }
def _match_listing_properties(
    listing_matches: pl.DataFrame, property_candidates: pl.DataFrame
) -> pl.DataFrame:
    """Match every listing to at most one property, one listing per property.

    Returns a frame with the `_listing_property_match_schema` columns, sorted
    by `_listing_idx`; it is empty when either input is empty or nothing
    matched.
    """
    if listing_matches.is_empty() or property_candidates.is_empty():
        return _empty_listing_property_matches()

    buckets, uprn_index = _index_candidates(
        property_candidates, "_property_match_postcode", "uprn"
    )
    found: list[dict] = []
    for listing in listing_matches.iter_rows(named=True):
        postcode = listing.get("_listing_match_postcode")
        bucket = buckets.get(postcode, []) if postcode else []
        record = _best_listing_property_candidate(listing, uprn_index, bucket)
        if record is not None:
            found.append(record)
    if not found:
        return _empty_listing_property_matches()

    # When two listings claim the same property, keep the most authoritative
    # match: an exact UPRN match always wins over a fuzzy address match (both can
    # score 100, so method must break the tie before score and listing index).
    matches = pl.DataFrame(found, schema=_listing_property_match_schema())
    is_uprn_match = pl.col("_property_match_method") == "uprn"
    ranked = matches.sort(
        [is_uprn_match, "_property_match_score", "_listing_idx"],
        descending=[True, True, False],
    )
    one_per_property = ranked.unique(
        ["_matched_postcode", "_matched_pp_address"],
        keep="first",
        maintain_order=True,
    )
    return one_per_property.sort("_listing_idx")
def _index_epc_streets(
    epc_candidates: pl.DataFrame,
) -> tuple[dict[str, dict[str, list[dict]]], dict[str, set[str]]]:
    """Index EPC candidate rows for the street-level fallback.

    Returns ``(streets, noise_tokens)``:

    * ``streets`` maps outcode -> street key -> rows, where the street key is
      the digit-stripped match address. Rows lacking an outcode, an address
      or a non-empty street key are skipped.
    * ``noise_tokens`` maps outcode -> the tokens found in at least
      ``max(2, n // 4)`` of that outcode's ``n`` street keys, i.e. roughly a
      quarter of them. Those are locality suffixes (LONDON, SURREY, the town
      name) rather than street names, and a fallback match must be anchored
      by at least one token that is NOT one of them. Otherwise a town-only
      listing address ("COULSDON SURREY") token_set-inflates to 100 against
      any street key carrying the same locality suffix and matches an
      arbitrary street in the outcode.
    """

    def _frequent_tokens(street_keys: dict[str, list[dict]]) -> set[str]:
        # Count each token once per street key, then keep the common ones.
        threshold = max(2, len(street_keys) // 4)
        tally: dict[str, int] = {}
        for street_key in street_keys:
            for token in set(street_key.split()):
                tally[token] = tally.get(token, 0) + 1
        return {token for token, seen in tally.items() if seen >= threshold}

    streets: dict[str, dict[str, list[dict]]] = {}
    for row in epc_candidates.iter_rows(named=True):
        outcode = row.get("_direct_epc_outcode")
        address = row.get("_direct_epc_match_address")
        if outcode and address:
            street = _street_only_address(address)
            if street:
                streets.setdefault(outcode, {}).setdefault(street, []).append(row)

    noise_tokens = {
        outcode: _frequent_tokens(by_street) for outcode, by_street in streets.items()
    }
    return streets, noise_tokens
def _best_direct_epc_candidate(
    listing: dict,
    uprn_index: dict[str, dict],
    candidates: list[dict],
    outcode_streets: dict[str, list[dict]] | None,
    outcode_noise_tokens: set[str],
    street_score_cache: dict[tuple[str, str], list[tuple[int, str]]],
) -> dict | None:
    """Match one listing to an EPC certificate and return the match record.

    The strict matcher (`_best_listing_match`) runs first; only if it finds
    nothing is the street-level fallback (`_best_street_epc_fallback`) tried.
    Returns None when both fail. The record copies the candidate's
    `_direct_*` values and adds the match status, score and method.
    """
    outcome = _best_listing_match(
        listing.get("_listing_uprn"),
        listing.get("_listing_match_address"),
        uprn_index,
        candidates,
        ["_direct_epc_match_address"],
    )
    if outcome is None:
        outcome = _best_street_epc_fallback(
            listing, outcode_streets, outcode_noise_tokens, street_score_cache
        )
        if outcome is None:
            return None

    candidate, score, method, _field = outcome
    copied_columns = (
        "_direct_epc_address",
        "_direct_current_energy_rating",
        "_direct_potential_energy_rating",
        "_direct_total_floor_area",
        "_direct_number_habitable_rooms",
        "_direct_floor_height",
        "_direct_construction_age_band",
        "_direct_is_construction_date_approximate",
        "_direct_was_council_house",
    )
    record: dict = {"_listing_idx": listing["_listing_idx"]}
    for column in copied_columns:
        record[column] = candidate.get(column)
    record["_direct_epc_match_status"] = "matched"
    record["_direct_epc_match_score"] = round(score, 1)
    record["_direct_epc_match_method"] = method
    return record
def _match_direct_epc(
    listing_matches: pl.DataFrame, epc_candidates: pl.DataFrame
) -> pl.DataFrame:
    """Match every listing directly against the EPC candidates.

    Returns a frame with the `_direct_epc_match_schema` columns, one row per
    matched listing; it is empty when either input is empty or nothing
    matched.
    """
    if listing_matches.is_empty() or epc_candidates.is_empty():
        return _empty_direct_epc_matches()

    buckets, uprn_index = _index_candidates(
        epc_candidates, "_direct_epc_match_postcode", "_direct_epc_uprn"
    )
    street_index, noise_tokens = _index_epc_streets(epc_candidates)
    # Shared across listings so same-street listings reuse one fuzzy scan.
    street_score_cache: dict[tuple[str, str], list[tuple[int, str]]] = {}

    records: list[dict] = []
    for listing in listing_matches.iter_rows(named=True):
        postcode = listing.get("_listing_match_postcode")
        bucket = buckets.get(postcode, []) if postcode else []
        outcode = listing.get("_listing_outcode") or _outcode_of(postcode)
        if outcode:
            streets_here = street_index.get(outcode)
            noise_here = noise_tokens.get(outcode, set())
        else:
            streets_here = None
            noise_here = set()
        record = _best_direct_epc_candidate(
            listing,
            uprn_index,
            bucket,
            streets_here,
            noise_here,
            street_score_cache,
        )
        if record is not None:
            records.append(record)

    if not records:
        return _empty_direct_epc_matches()
    return pl.DataFrame(records, schema=_direct_epc_match_schema())
def _enrich_listings_with_direct_epc(
    listings: pl.DataFrame,
    epc_path: Path | None,
) -> pl.DataFrame:
    """Attach the `_direct_*` EPC columns to `listings`.

    With an ``epc_path`` and at least one listing outcode, EPC candidates for
    those outcodes are loaded (using a temporary directory under
    `local_tmp_dir()`), matched, and left-joined on `_listing_idx`. In every
    other case, and for unmatched listings, the `_direct_*` columns are
    present but null. Prints the candidate and match counts.
    """
    if epc_path is None:
        return _ensure_direct_epc_columns(listings)

    listing_matches = _listing_match_frame(listings)
    outcode_frame = listing_matches.select("_listing_outcode").drop_nulls().unique()
    listing_outcodes = outcode_frame.to_series().to_list()
    if not listing_outcodes:
        return _ensure_direct_epc_columns(listings)

    with tempfile.TemporaryDirectory(
        prefix="direct_listing_epc_", dir=local_tmp_dir()
    ) as tmpdir:
        epc_candidates = _load_direct_epc_candidates(
            epc_path, listing_outcodes, Path(tmpdir)
        )
        print(f"Direct listing EPC candidates: {epc_candidates.height}")
        direct_matches = _match_direct_epc(listing_matches, epc_candidates)
        print(f"Direct listing EPC matches: {direct_matches.height}")

    if direct_matches.is_empty():
        enriched = listings
    else:
        enriched = listings.join(direct_matches, on="_listing_idx", how="left")
    return _ensure_direct_epc_columns(enriched)
def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
    """Fill raw EPC columns from their directly-matched listing counterparts.

    For every (raw, direct) pair in `_DIRECT_EPC_RAW_COLUMN_MAP` the raw column
    is replaced by the raw value when it is non-null and by the direct value
    otherwise. `was_council_house` is the exception: it becomes "Yes" when
    either side is "Yes".
    """

    def _merged(raw_column: str, direct_column: str) -> pl.Expr:
        raw = pl.col(raw_column)
        direct = pl.col(direct_column)
        fallback = pl.coalesce(raw, direct)
        if raw_column != "was_council_house":
            return fallback.alias(raw_column)

        # The raw property-level flag is fill_null("No") upstream, so a plain
        # coalesce would let that non-null "No" mask a directly-matched "Yes".
        # "Former council house" should fire if EITHER side says so.
        either_yes = (raw == "Yes") | (direct == "Yes")
        return (
            pl.when(either_yes)
            .then(pl.lit("Yes"))
            .otherwise(fallback)
            .alias(raw_column)
        )

    merged_columns = [
        _merged(raw_column, direct_column)
        for raw_column, direct_column in _DIRECT_EPC_RAW_COLUMN_MAP.items()
    ]
    return wide.with_columns(merged_columns)
def _build_unmatched_listing_seed_rows(
    unmatched_listing_idxs: pl.DataFrame,
    listings: pl.DataFrame,
    template_schema: pl.Schema,
) -> pl.DataFrame:
    """Build wide-shaped rows for listings that matched no existing property.

    A seed row is populated with the listing's postcode, pp_address, property
    type, tenure, floor area, room count and asking price (as `latest_price`),
    plus the direct-EPC and listing overlay columns, so the postcode-keyed
    joins later in `_build` can fill in the rest. Every other column of
    `template_schema` is null.

    Args:
        unmatched_listing_idxs: Frame of `_listing_idx` values to seed.
        listings: Full listings frame, joined on `_listing_idx`.
        template_schema: Column names and dtypes the result must have.

    Returns:
        A frame with exactly the columns of `template_schema`, in schema
        order; empty when there are no unmatched listings.
    """
    if unmatched_listing_idxs.is_empty():
        return pl.DataFrame(schema=template_schema)

    unmatched = unmatched_listing_idxs.join(
        _ensure_direct_epc_columns(listings), on="_listing_idx", how="inner"
    )

    seed_sources: dict[str, pl.Expr] = {
        "postcode": pl.col("postcode"),
        "pp_address": pl.col("pp_address"),
        "pp_property_type": pl.col("_actual_property_type"),
        "duration": pl.col("_actual_leasehold_freehold"),
        "total_floor_area": pl.coalesce(
            pl.col("_actual_total_floor_area"), pl.col("_direct_total_floor_area")
        ),
        # The direct-EPC habitable-room count wins over the listing's value:
        # the scraped room count is bedrooms + bathrooms (upstream storage.py
        # defect) and therefore over-counts. The listing value is used only
        # when the direct-EPC match has no count.
        "number_habitable_rooms": pl.coalesce(
            pl.col("_direct_number_habitable_rooms"),
            pl.col("_actual_number_habitable_rooms"),
        ),
        "latest_price": pl.col("_actual_asking_price"),
    }
    # Remaining raw EPC columns come straight from the direct match; entries
    # already defined above are left untouched.
    for raw_column, direct_column in _DIRECT_EPC_RAW_COLUMN_MAP.items():
        seed_sources.setdefault(raw_column, pl.col(direct_column))
    # Overlay columns pass through under their own names.
    seed_sources.update(
        {dst: pl.col(dst) for _src, dst, _dt in _LISTING_OVERLAY_SOURCES}
    )

    def _seed_expr(col_name: str, dtype: pl.DataType) -> pl.Expr:
        source = seed_sources.get(col_name)
        if source is None:
            return pl.lit(None, dtype=dtype).alias(col_name)
        return source.cast(dtype, strict=False).alias(col_name)

    return unmatched.select(
        [_seed_expr(col_name, dtype) for col_name, dtype in template_schema.items()]
    )
def _integrate_listings(
    wide: pl.LazyFrame,
    listings_path: Path,
    arcgis_path: Path,
    epc_path: Path | None = None,
) -> pl.LazyFrame:
    """Merge scraped listings into the wide property frame.

    Listings are matched to wide rows on (postcode, pp_address) by
    `_match_listing_properties`. A matched listing contributes its overlay and
    direct-EPC columns to the existing wide row, so that row's historical
    context is kept. An unmatched listing is appended as a new seed row (see
    `_build_unmatched_listing_seed_rows`) so the downstream postcode-keyed
    joins populate it like any other row.

    Args:
        wide: Wide property frame to extend.
        listings_path: Scraped listings source.
        arcgis_path: Postcode data used when loading the listings.
        epc_path: Optional raw EPC source for direct listing-to-EPC matching.

    Returns:
        `wide` with overlay columns attached, followed by the seed rows.
    """
    listings = _load_listings_for_merge(listings_path, arcgis_path)
    print(f"Listings loaded: {listings.height}")
    listings = _enrich_listings_with_direct_epc(listings, epc_path)

    direct_epc_columns = [column for column, _dtype in _DIRECT_EPC_COLUMNS]
    attachment_columns = [
        dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES
    ] + direct_epc_columns

    property_candidates = _property_match_candidate_frame(wide)
    listing_frame = _listing_match_frame(listings)
    matches = _match_listing_properties(listing_frame, property_candidates)

    listing_total = listings.height
    if listing_total > 0:
        hit_count = matches.height
        hit_pct = 100 * hit_count / listing_total
        print(
            f"Listings matched to existing wide rows: {hit_count}/{listing_total} "
            f"({hit_pct:.1f}%)"
        )

    # One overlay row per matched (postcode, pp_address) key.
    matched_overlay = (
        matches.join(listings, on="_listing_idx", how="inner")
        .select(
            pl.col("_matched_postcode").alias("postcode"),
            pl.col("_matched_pp_address").alias("pp_address"),
            *attachment_columns,
        )
        .unique(["postcode", "pp_address"], keep="first")
    )

    attached = wide.join(
        matched_overlay.lazy(), on=["postcode", "pp_address"], how="left"
    )
    # Fold the direct-EPC values into the raw columns, then discard the
    # direct-EPC helper columns.
    wide_output = _coalesce_direct_epc_columns(attached).drop(
        direct_epc_columns, strict=False
    )

    unmatched_idxs = listings.select("_listing_idx").join(
        matches.select("_listing_idx"), on="_listing_idx", how="anti"
    )
    seed_rows = _build_unmatched_listing_seed_rows(
        unmatched_idxs,
        listings,
        template_schema=wide_output.collect_schema(),
    )
    return pl.concat([wide_output, seed_rows.lazy()], how="vertical_relaxed")
def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
    """Reduce the post-rename wide frame to one enriched row per listing.

    Keeps only rows whose `_LISTING_FLAG_COLUMN` is set, de-duplicates them on
    that column, copies the `_actual_*` overlay values into their output
    columns, derives the price columns, and finally drops the overlay columns.
    """
    flagged = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null())

    # A matched listing's overlay is attached to every wide row that shares its
    # (postcode, pp_address). The terminated-postcode remap can fold several
    # distinct wide rows onto one such key, which would emit one duplicate
    # listing per folded row. Each listing matches exactly one (postcode,
    # pp_address) and each seed row carries a unique URL, so keeping one row per
    # listing URL removes only that fan-out and never merges distinct listings.
    result = flagged.unique(
        subset=[_LISTING_FLAG_COLUMN], keep="first", maintain_order=True
    )

    # Overlay columns that are copied verbatim under an output name.
    overlay_to_output = {
        "_actual_listing_url": "Listing URL",
        "_actual_listing_date": "Listing date",
        "_actual_listing_status": "Listing status",
        "_actual_listing_features": "Listing features",
        "_actual_asking_price": "Asking price",
        "_actual_asking_price_per_sqm": "Asking price per sqm",
        "_actual_bedrooms": "Bedrooms",
        "_actual_bathrooms": "Bathrooms",
        "_actual_price_qualifier": "Price qualifier",
        "_actual_property_sub_type": "Property sub-type",
    }

    floor_area = pl.col("Total floor area (sqm)")
    room_count = pl.col("Number of bedrooms & living rooms")
    listing_type = pl.col("_actual_property_type")
    listing_tenure = pl.col("_actual_leasehold_freehold")

    result = result.with_columns(
        *(pl.col(src).alias(out) for src, out in overlay_to_output.items()),
        # Listing coordinates take precedence over the postcode centroid.
        pl.coalesce(pl.col("_actual_lat").cast(pl.Float64), pl.col("lat")).alias("lat"),
        pl.coalesce(pl.col("_actual_lon").cast(pl.Float64), pl.col("lon")).alias("lon"),
        # A listing floor area, when present, overrides any EPC/PP value.
        pl.coalesce(
            pl.col("_actual_total_floor_area").cast(pl.Float64),
            floor_area,
        ).alias("Total floor area (sqm)"),
        # Rooms: the EPC habitable-room count wins; the listing value is only a
        # fallback. The scraped "Number of bedrooms & living rooms" is really
        # bedrooms + bathrooms (an upstream storage.py defect), so preferring
        # it would inflate the count and overwrite a correct EPC value.
        pl.coalesce(
            room_count,
            pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
        ).alias("Number of bedrooms & living rooms"),
        # Listing property type / tenure are used only when they are one of
        # the recognised values.
        pl.when(listing_type.is_in(_PROPERTY_TYPE_VALUES))
        .then(listing_type)
        .otherwise(pl.col("Property type"))
        .alias("Property type"),
        pl.when(listing_tenure.is_in(_TENURE_VALUES))
        .then(listing_tenure)
        .otherwise(pl.col("Leasehold/Freehold"))
        .alias("Leasehold/Freehold"),
    )

    # Keep a supplied asking price per sqm; otherwise derive it from the asking
    # price when the floor area is known and above the minimum.
    asking_price = pl.col("Asking price")
    can_derive_psm = (
        asking_price.is_not_null()
        & floor_area.is_not_null()
        & (floor_area > MIN_FLOOR_AREA_M2)
    )
    derived_psm = (
        (asking_price.cast(pl.Float64) / floor_area)
        .round(0)
        .cast(pl.Int32, strict=False)
    )
    result = result.with_columns(
        pl.coalesce(
            pl.col("Asking price per sqm"),
            pl.when(can_derive_psm).then(derived_psm).otherwise(None),
        ).alias("Asking price per sqm")
    )

    has_transaction = pl.col("Date of last transaction").is_not_null()
    result = result.with_columns(
        asking_price.alias("Estimated current price"),
        pl.col("Asking price per sqm").alias("Est. price per sqm"),
        pl.coalesce(pl.col("Last known price"), asking_price).alias(
            "Last known price"
        ),
        pl.when(has_transaction)
        .then(pl.lit("matched"))
        .otherwise(pl.lit("unmatched"))
        .alias("Historical property match status"),
    )

    overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES]
    return result.drop(overlay_columns, strict=False)
@dataclass
class _BuildResult:
    """Outputs of `_build`.

    Either the (`postcode`, `properties`) pair is populated (normal mode) or
    `listings` alone is populated (listings mode); the remaining slot(s) stay
    None.
    """

    # Postcode-level frame; set together with `properties` in normal mode.
    postcode: pl.DataFrame | None = None
    # Property-level frame; set together with `postcode` in normal mode.
    properties: pl.DataFrame | None = None
    # Enriched-listings frame; set only in listings mode.
    listings: pl.DataFrame | None = None
def _build(
    epc_pp_path: Path,
    arcgis_path: Path,
    iod_path: Path,
    poi_proximity_path: Path,
    ethnicity_path: Path,
    crime_path: Path,
    noise_path: Path,
    school_catchments_path: Path,
    broadband_path: Path,
    conservation_areas_path: Path,
    rental_prices_path: Path,
    median_age_path: Path,
    election_results_path: Path,
    tree_density_postcodes_path: Path | None = None,
    listed_buildings_path: Path | None = None,
    actual_listings_path: Path | None = None,
    actual_listings_epc_path: Path | None = None,
    mode: Literal["normal", "listings"] = "normal",
) -> _BuildResult:
    """Build postcode/properties dataframes (or enriched listings) from epc_pp + auxiliary data.

    Modes:
      * `normal` — produces (postcode_df, properties_df). `actual_listings_path`
        and `actual_listings_epc_path` are ignored if supplied, so scraped
        listings are never spliced into the historical property frame.
      * `listings` — requires `actual_listings_path`; produces a single
        enriched-listings DataFrame and skips the postcode/properties outputs.
        Listings flow through the same enrichment joins as historical rows,
        so postcode-scoped features (tree density, crime, deprivation, …) end
        up populated on every listing with a valid postcode.

    Args:
        tree_density_postcodes_path: Optional; when None no tree-density table
            is passed to the area joins.
        listed_buildings_path: Optional; when None the listed-building column
            is created empty and then filled with "No".
        The remaining path arguments are read unconditionally.

    Returns:
        A `_BuildResult` with `postcode` and `properties` set (normal mode) or
        `listings` set (listings mode).

    Raises:
        ValueError: if `mode` is "listings" and `actual_listings_path` is None.
    """
    if mode == "listings" and actual_listings_path is None:
        raise ValueError("listings mode requires actual_listings_path")

    _validate_lsoa_source_coverage(iod_path, ethnicity_path)
    _validate_lad_source_coverage(iod_path, rental_prices_path)

    wide = pl.scan_parquet(epc_pp_path).filter(
        pl.col("total_floor_area").is_null()
        | (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
    )

    # Remap terminated postcodes to nearest active successor before filtering to
    # the supported active-English postcode universe. Historical properties from
    # terminated English postcodes are retained under their successor postcode.
    postcode_mapping = build_postcode_mapping(arcgis_path)
    wide = _remap_terminated_postcodes(wide, postcode_mapping.lazy())
    # The remap can collapse two terminated postcodes onto one active successor,
    # duplicating a physical address's (postcode, pp_address) key; keep only the
    # most-recent transaction per address before the per-postcode joins.
    wide = _dedupe_collapsed_properties(wide)

    arcgis_raw = pl.scan_parquet(arcgis_path)
    arcgis = _active_english_postcode_area(arcgis_raw)
    active_postcodes = arcgis.select("postcode").unique()
    active_postcode_count = (
        active_postcodes.select(pl.len()).collect(engine="streaming").item()
    )
    wide = _filter_to_active_english_postcodes(wide, active_postcodes)

    if listed_buildings_path is not None:
        active_postcodes_for_listed = (
            arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")
            .filter(pl.col("doterm").is_null())
            .select(
                pl.col("pcds").alias("postcode"),
                "east1m",
                "north1m",
            )
            .collect(engine="streaming")
        )
        listed_flags = _listed_building_flags(
            wide.select("postcode", "pp_address", "epc_address"),
            active_postcodes_for_listed,
            listed_buildings_path,
        )
        wide = wide.join(listed_flags.lazy(), on=["postcode", "pp_address"], how="left")
    else:
        wide = wide.with_columns(
            pl.lit(None, dtype=pl.Utf8).alias(LISTED_BUILDING_FEATURE)
        )

    # Only listings mode splices scraped listings in (the check at the top
    # guarantees the path is set there). Normal mode ignores the path, as
    # documented, so listing seed rows are not appended to the historical
    # property frame.
    if mode == "listings" and actual_listings_path is not None:
        wide = _integrate_listings(
            wide,
            actual_listings_path,
            arcgis_path,
            epc_path=actual_listings_epc_path,
        )
        # Seed rows were appended after the first filter, so filter again.
        wide = _filter_to_active_english_postcodes(wide, active_postcodes)

    wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))

    # NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
    # `_active_english_postcode_area` aliases them back to the short canonical
    # names used across the pipeline so downstream joins don't need to know
    # about NSPL's versioning scheme.
    wide = wide.join(arcgis, on="postcode", how="left")
    postcode_area = arcgis

    iod = pl.scan_parquet(iod_path).with_columns(
        *(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
    )
    ethnicity = pl.scan_parquet(ethnicity_path)
    crime = pl.scan_parquet(crime_path)
    median_age = pl.scan_parquet(median_age_path)
    election = pl.scan_parquet(election_results_path)
    poi_counts = pl.scan_parquet(poi_proximity_path)

    noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
    noise = (
        pl.scan_parquet(noise_path)
        .with_columns(
            # NaN → null so max_horizontal ignores missing instead of propagating NaN
            *[pl.col(c).fill_nan(None) for c in noise_cols],
        )
        .with_columns(
            pl.max_horizontal(*noise_cols).alias("noise_lden_db"),
        )
        .select("postcode", "noise_lden_db")
    )

    school_catchments = pl.scan_parquet(school_catchments_path)
    conservation_areas = _conservation_area_by_postcode(
        arcgis.select("postcode", "lat", "lon"), conservation_areas_path
    )
    tree_density = None
    if tree_density_postcodes_path is not None:
        tree_density = _tree_density_by_postcode(tree_density_postcodes_path)

    # Broadband: derive max available download speed tier per postcode from
    # Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
    # UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as a numeric (UInt16) Mbps value so
    # it sorts/filters correctly; null (not a fabricated 10) when no availability
    # tier is present, so "no data" is distinguishable from a genuine 10 Mbps.
    broadband = (
        pl.scan_parquet(broadband_path)
        .select(
            pl.col("postcode_space").alias("bb_postcode"),
            pl.when(pl.col("Gigabit availability (% premises)") > 0)
            .then(1000)
            .when(pl.col("UFBB availability (% premises)") > 0)
            .then(300)
            .when(pl.col("UFBB (100Mbit/s) availability (% premises)") > 0)
            .then(100)
            .when(pl.col("SFBB availability (% premises)") > 0)
            .then(30)
            .otherwise(None)
            .cast(pl.UInt16)
            .alias("max_download_speed"),
        )
        .group_by("bb_postcode")
        .agg(pl.col("max_download_speed").max())
    )

    # The same side tables are joined onto both the property frame and the
    # postcode-level frame.
    area_side_tables = {
        "iod": iod,
        "ethnicity": ethnicity,
        "crime": crime,
        "median_age": median_age,
        "election": election,
        "poi_counts": poi_counts,
        "noise": noise,
        "school_catchments": school_catchments,
        "conservation_areas": conservation_areas,
        "tree_density": tree_density,
        "broadband": broadband,
    }
    wide = _join_area_side_tables(wide, **area_side_tables)
    postcode_area = _join_area_side_tables(postcode_area, **area_side_tables)

    # Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
    wide = wide.with_columns(
        (pl.col("number_habitable_rooms") - 1)
        .clip(0, 4)
        .cast(pl.UInt8)
        .alias("_bedrooms"),
    )

    rental = pl.scan_parquet(rental_prices_path).select(
        "area_code", "bedrooms", "mean_monthly_rent"
    )
    wide = wide.join(
        rental,
        left_on=["Local Authority District code (2024)", "_bedrooms"],
        right_on=["area_code", "bedrooms"],
        how="left",
    )

    # Derive property_type: prefer EPC data, fall back to price-paid.
    # For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
    bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
        ["NO DATA!", "Not Recorded"]
    )
    has_epc = pl.col("epc_property_type").is_not_null()
    is_house = pl.col("epc_property_type") == "House"

    wide = wide.with_columns(
        pl.when(has_epc & is_house & ~bad_built_form)
        .then(pl.col("built_form"))
        .when(has_epc & is_house)
        .then(pl.col("pp_property_type"))
        .when(has_epc)
        .then(pl.col("epc_property_type"))
        .otherwise(pl.col("pp_property_type"))
        # Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes",
        # collapse terrace sub-types, and fold rare types into "Other"
        .replace(
            {
                "Flat": "Flats/Maisonettes",
                "Maisonette": "Flats/Maisonettes",
                "End-Terrace": "Terraced",
                "Mid-Terrace": "Terraced",
                "Enclosed End-Terrace": "Terraced",
                "Enclosed Mid-Terrace": "Terraced",
                "Bungalow": "Other",
                "Park home": "Other",
            }
        )
        .alias("property_type")
    )

    wide = wide.with_columns(
        # Placeholder values become null rather than a category of their own.
        pl.when(pl.col("duration") == "U")
        .then(None)
        .otherwise(pl.col("duration"))
        .alias("duration"),
        pl.when(pl.col("current_energy_rating") == "INVALID!")
        .then(None)
        .otherwise(pl.col("current_energy_rating"))
        .alias("current_energy_rating"),
    ).with_columns(
        # Null out implausible per-sqm values (outside the kNN comparable band):
        # bulk/block transactions divided by a single unit's floor area otherwise
        # produce figures up to ~£1.5M/sqm.
        pl.when(
            (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
            & (
                (pl.col("latest_price") / pl.col("total_floor_area")).is_between(
                    MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM
                )
            )
        )
        .then(
            (pl.col("latest_price") / pl.col("total_floor_area"))
            .round(0)
            .cast(pl.Int32)
        )
        .otherwise(None)
        .alias("Price per sqm"),
    )

    wide = _finalize_merged_columns(wide)
    postcode_area = _finalize_merged_columns(postcode_area)

    print("Collecting with streaming engine...")
    if mode == "listings":
        df = wide.collect(engine="streaming")
        enriched_listings = _finalize_listings(df)
        _validate_property_postcodes(enriched_listings)
        print(f"Enriched listings rows: {enriched_listings.height}")
        return _BuildResult(listings=enriched_listings)

    df, postcode_features = pl.collect_all([wide, postcode_area], engine="streaming")
    _validate_property_postcodes(df)
    postcode_df, properties_df = _split_normal_outputs(
        df, postcode_features, expected_postcode_count=active_postcode_count
    )
    print(f"Postcode rows: {postcode_df.height} (unique postcodes)")
    print(f"Property rows: {properties_df.height}")
    return _BuildResult(postcode=postcode_df, properties=properties_df)
def main():
    """Command-line entry point: parse arguments, run `_build`, write parquet.

    Supplying --actual-listings switches to listings mode, which writes only
    --output-listings. Otherwise --output-postcodes and --output-properties
    are both required and written. Invalid argument combinations end the
    process through `parser.error`.
    """
    parser = argparse.ArgumentParser(
        description="Build wide property dataframe with all joins"
    )
    parser.add_argument(
        "--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file"
    )
    parser.add_argument(
        "--iod",
        type=Path,
        required=True,
        help="Index of Deprivation parquet file",
    )
    # `_build` scans this file unconditionally, so a missing flag must fail
    # here with a clear usage error rather than later inside the build.
    parser.add_argument(
        "--poi-proximity",
        type=Path,
        required=True,
        help="POI proximity counts parquet file",
    )
    parser.add_argument(
        "--ethnicity",
        type=Path,
        required=True,
        help="Census 2021 ethnic group (TS021) by LSOA parquet file",
    )
    parser.add_argument(
        "--crime",
        type=Path,
        required=True,
        help="Crime by LSOA parquet file",
    )
    parser.add_argument(
        "--noise", type=Path, required=True, help="Road noise by postcode parquet file"
    )
    parser.add_argument(
        "--school-catchments",
        type=Path,
        required=True,
        help="School catchment counts parquet file",
    )
    parser.add_argument(
        "--broadband",
        type=Path,
        required=True,
        help="Broadband performance by output area parquet file",
    )
    parser.add_argument(
        "--conservation-areas",
        type=Path,
        required=True,
        help="Planning Data conservation areas GeoJSON",
    )
    parser.add_argument(
        "--listed-buildings",
        type=Path,
        required=False,
        help="Historic England NHLE listed-building points GeoPackage",
    )
    parser.add_argument(
        "--rental-prices",
        type=Path,
        required=True,
        help="ONS rental prices by LA and bedroom count parquet file",
    )
    parser.add_argument(
        "--median-age",
        type=Path,
        required=True,
        help="Census 2021 median age by LSOA parquet file",
    )
    parser.add_argument(
        "--election-results",
        type=Path,
        required=True,
        help="2024 General Election results by constituency parquet file",
    )
    parser.add_argument(
        "--tree-density-postcodes",
        type=Path,
        required=False,
        help="Postcode-level tree density parquet from pipeline.transform.tree_density",
    )
    parser.add_argument(
        "--output-postcodes",
        type=Path,
        required=False,
        help="Output postcode parquet (normal mode only)",
    )
    parser.add_argument(
        "--output-properties",
        type=Path,
        required=False,
        help="Output properties parquet (normal mode only)",
    )
    parser.add_argument(
        "--actual-listings",
        type=Path,
        required=False,
        help=(
            "Optional scraped-listings parquet. When provided, listings flow "
            "through the same merge pipeline as historical properties — set "
            "--output-listings to write the enriched-listings file instead "
            "of the postcode/properties files."
        ),
    )
    parser.add_argument(
        "--epc",
        type=Path,
        required=False,
        help=(
            "Raw EPC certificates CSV or zip. Used only with --actual-listings "
            "to match live listings directly to EPC records."
        ),
    )
    parser.add_argument(
        "--output-listings",
        type=Path,
        required=False,
        help=(
            "Output enriched-listings parquet path. Required (and only valid) "
            "when --actual-listings is set; --output-postcodes and "
            "--output-properties are ignored in this mode."
        ),
    )

    args = parser.parse_args()

    # The presence of --actual-listings alone selects the mode.
    listings_mode = args.actual_listings is not None
    if listings_mode and args.output_listings is None:
        parser.error("--output-listings is required when --actual-listings is set")
    if not listings_mode and (
        args.output_postcodes is None or args.output_properties is None
    ):
        parser.error(
            "--output-postcodes and --output-properties are required in normal mode"
        )

    result = _build(
        epc_pp_path=args.epc_pp,
        arcgis_path=args.arcgis,
        iod_path=args.iod,
        poi_proximity_path=args.poi_proximity,
        ethnicity_path=args.ethnicity,
        crime_path=args.crime,
        noise_path=args.noise,
        school_catchments_path=args.school_catchments,
        broadband_path=args.broadband,
        conservation_areas_path=args.conservation_areas,
        rental_prices_path=args.rental_prices,
        median_age_path=args.median_age,
        election_results_path=args.election_results,
        tree_density_postcodes_path=args.tree_density_postcodes,
        listed_buildings_path=args.listed_buildings,
        actual_listings_path=args.actual_listings,
        # --epc is only meaningful together with --actual-listings.
        actual_listings_epc_path=args.epc if listings_mode else None,
        mode="listings" if listings_mode else "normal",
    )

    if listings_mode:
        listings_df = result.listings
        assert listings_df is not None  # guaranteed by mode contract
        args.output_listings.parent.mkdir(parents=True, exist_ok=True)
        listings_df.write_parquet(args.output_listings)
        size_mb = args.output_listings.stat().st_size / (1024 * 1024)
        print(
            f"\nEnriched listings: {listings_df.height} rows, "
            f"{len(listings_df.columns)} columns"
        )
        print(f"Wrote {args.output_listings} ({size_mb:.1f} MB)")
        return

    postcode_df = result.postcode
    properties_df = result.properties
    assert postcode_df is not None and properties_df is not None

    for label, frame, target in (
        ("Postcode", postcode_df, args.output_postcodes),
        ("Property", properties_df, args.output_properties),
    ):
        print(f"\n{label} columns: {frame.columns}")
        print(f"{label} rows: {frame.height}")
        # Create the destination directory, as listings mode does, so a missing
        # directory does not fail the write after the whole build has run.
        target.parent.mkdir(parents=True, exist_ok=True)
        frame.write_parquet(target)
        size_mb = target.stat().st_size / (1024 * 1024)
        print(f"Wrote {target} ({size_mb:.1f} MB)")
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    main()