has issues

This commit is contained in:
Andras Schmelczer 2026-05-25 13:20:17 +01:00
parent 2e112d7398
commit c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions

View file

@ -1,12 +1,27 @@
import argparse
import re
import numpy as np
import polars as pl
from pathlib import Path
import pyogrio
from pyproj import Transformer
from scipy.spatial import cKDTree
from shapely import from_wkb, points
from shapely.geometry.base import BaseGeometry
from shapely.strtree import STRtree
from thefuzz import fuzz
from pipeline.utils.fuzzy_join import normalize_address_key
from pipeline.utils.postcode_mapping import build_postcode_mapping
# NOTE(review): presumably a minimum floor area in square metres; its use is
# not visible in this section of the file - confirm against the caller.
MIN_FLOOR_AREA_M2 = 10
# Name of the output column holding the "Yes"/"No" conservation-area flag.
CONSERVATION_AREA_FEATURE = "Within conservation area"
# Name of the output column holding the "Yes"/"No" listed-building flag.
LISTED_BUILDING_FEATURE = "Listed building"
# Default upper bound on the distance between a listed-building point and a
# postcode for the postcode to be kept as a candidate (same units as the
# easting/northing coordinates; metres, going by the name).
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
# Default number of nearest postcodes looked up per listed-building point.
LISTED_BUILDING_NEAREST_POSTCODES = 3
# Default minimum fuzzy token-set score for an address to count as a match.
LISTED_BUILDING_MIN_MATCH_SCORE = 95
_IOD_PERCENTILE_COLUMNS = [
"Education, Skills and Training Score",
@ -24,6 +39,8 @@ _AREA_COLUMNS = [
"lon",
# Runtime provenance for deciding whether missing coordinates are skippable.
"ctry25cd",
# Keyed lookup for postcode-level side tables (e.g. crime time series).
"lsoa21",
# Deprivation
"Income Score",
"Employment Score",
@ -63,6 +80,7 @@ _AREA_COLUMNS = [
# Environment
"Noise (dB)",
"Max available download speed (Mbps)",
CONSERVATION_AREA_FEATURE,
# Schools
"Good+ primary schools within 5km",
"Good+ secondary schools within 5km",
@ -97,6 +115,20 @@ _RENT_SOURCE_UNAVAILABLE_LADS = {
"E06000053": "Isles of Scilly",
"E09000001": "City of London",
}
# Matches each run of consecutive digits; used to pull house/list-entry
# numbers out of normalised address and name keys.
_NUMBER_RE = re.compile(r"\d+")
# Tokens that do not count towards a listed-building name being "substantive"
# enough to match on.
# NOTE(review): upper-case only, so this assumes normalize_address_key
# upper-cases its output - confirm.
_LISTED_NAME_STOP_WORDS = {
"A",
"AN",
"AND",
"AT",
"BY",
"IN",
"OF",
"ON",
"THE",
"TO",
"WITH",
}
def _is_dynamic_poi_metric_column(column: str) -> bool:
@ -105,6 +137,389 @@ def _is_dynamic_poi_metric_column(column: str) -> bool:
)
def _numbers_compatible(left: str, right: str) -> bool:
    """Check that the digit runs in two strings do not contradict each other.

    Each string is reduced to the set of digit runs it contains. The strings
    are compatible when neither contains a number, or when both do and one
    set of numbers is contained in the other. A string with numbers is never
    compatible with a string without any.
    """
    numbers_left = set(_NUMBER_RE.findall(left))
    numbers_right = set(_NUMBER_RE.findall(right))
    # Exactly one side carrying numbers is an automatic mismatch.
    if bool(numbers_left) != bool(numbers_right):
        return False
    # Only the smaller set can be a subset of the other, so testing both
    # directions is the same as testing "smaller within larger".
    return numbers_left <= numbers_right or numbers_right <= numbers_left
def _listed_candidate_schema() -> dict[str, pl.DataType]:
    """Return the column-name to dtype mapping used for listed-candidate frames."""
    text_columns = ("postcode", "_listed_match_name", "_listed_grade")
    schema: dict[str, pl.DataType] = {name: pl.Utf8 for name in text_columns}
    schema["_listed_entry"] = pl.Int64
    return schema
def _empty_listed_candidates() -> pl.DataFrame:
    """Build a zero-row frame carrying the listed-candidate columns and dtypes."""
    candidate_schema = _listed_candidate_schema()
    return pl.DataFrame(schema=candidate_schema)
def _empty_listed_property_flags() -> pl.DataFrame:
    """Build a zero-row frame with the listed-building flag columns, all strings."""
    flag_columns = ("postcode", "pp_address", LISTED_BUILDING_FEATURE)
    return pl.DataFrame(schema={column: pl.Utf8 for column in flag_columns})
def _is_matchable_listed_name(name_key: str | None) -> bool:
    """Decide whether a listed-building name key is specific enough to match on.

    A key qualifies when it contains at least one digit, or when it has at
    least two whitespace-separated tokens that are three or more characters
    long and are not stop words. Missing or empty keys never qualify.
    """
    if not name_key:
        return False
    if _NUMBER_RE.search(name_key) is not None:
        return True

    substantive = 0
    for word in name_key.split():
        if len(word) < 3 or word in _LISTED_NAME_STOP_WORDS:
            continue
        substantive += 1
        if substantive == 2:
            return True
    return False
def _load_listed_building_points(listed_buildings_path: Path) -> pl.DataFrame:
    """Read listed-building point attributes and prepare them for name matching.

    Args:
        listed_buildings_path: Dataset readable by pyogrio whose geometry type
            contains "Point".

    Returns:
        A frame with ListEntry, Name, Grade, Easting and Northing cast to
        fixed dtypes, plus a ``_listed_match_name`` column produced by
        ``normalize_address_key``. Rows with a null name, coordinate or match
        key are dropped.

    Raises:
        ValueError: If the dataset's geometry type does not contain "Point",
            or if any of the expected attribute columns is absent.
    """
    columns = ["ListEntry", "Name", "Grade", "Easting", "Northing"]

    layer_info = pyogrio.read_info(listed_buildings_path)
    geometry_type = str(layer_info.get("geometry_type") or "")
    if "Point" not in geometry_type:
        raise ValueError(
            f"Expected listed-building point data, got geometry {geometry_type!r}"
        )

    # Only the attribute table is needed; coordinates come from the
    # Easting/Northing columns rather than the geometry.
    _, arrow_table = pyogrio.read_arrow(
        listed_buildings_path,
        columns=columns,
        read_geometry=False,
    )
    raw = pl.from_arrow(arrow_table)

    missing = sorted(set(columns) - set(raw.columns))
    if missing:
        raise ValueError(
            f"{listed_buildings_path} is missing listed-building columns: {missing}"
        )

    dtypes = {
        "ListEntry": pl.Int64,
        "Name": pl.Utf8,
        "Grade": pl.Utf8,
        "Easting": pl.Float64,
        "Northing": pl.Float64,
    }
    typed = raw.select([pl.col(name).cast(dtype) for name, dtype in dtypes.items()])
    complete = typed.drop_nulls(["Name", "Easting", "Northing"])
    keyed = complete.with_columns(
        normalize_address_key(pl.col("Name")).alias("_listed_match_name")
    )
    return keyed.filter(pl.col("_listed_match_name").is_not_null())
def _postcode_listed_building_candidates(
    listed_points: pl.DataFrame,
    active_postcodes: pl.DataFrame,
    *,
    nearest_postcodes: int = LISTED_BUILDING_NEAREST_POSTCODES,
    max_distance_m: float = LISTED_BUILDING_MATCH_RADIUS_M,
) -> pl.DataFrame:
    """Pair each listed-building point with the postcodes closest to it.

    Args:
        listed_points: Frame with ``_listed_match_name``, ``Grade``,
            ``ListEntry``, ``Easting`` and ``Northing`` columns.
        active_postcodes: Frame with ``postcode``, ``east1m`` and ``north1m``
            columns.
        nearest_postcodes: How many nearest postcodes to look up per point
            (at least one, at most the number of usable postcodes).
        max_distance_m: Neighbours farther away than this are discarded.

    Returns:
        One row per distinct (postcode, match name, list entry) combination,
        using the listed-candidate schema. Points whose name is not
        considered matchable contribute nothing. The frame is empty when
        either input has no usable rows or nothing lies within range.

    Raises:
        ValueError: If either frame lacks one of its required columns.
    """
    if listed_points.is_empty() or active_postcodes.is_empty():
        return _empty_listed_candidates()

    missing = sorted({"postcode", "east1m", "north1m"} - set(active_postcodes.columns))
    if missing:
        raise ValueError(f"Active postcode data missing required columns: {missing}")
    missing = sorted(
        {"_listed_match_name", "Grade", "ListEntry", "Easting", "Northing"}
        - set(listed_points.columns)
    )
    if missing:
        raise ValueError(f"Listed-building data missing required columns: {missing}")

    # Keep only rows with complete, finite coordinates on both sides.
    postcodes = active_postcodes.drop_nulls(["postcode", "east1m", "north1m"]).filter(
        pl.col("east1m").is_finite() & pl.col("north1m").is_finite()
    )
    listed = listed_points.drop_nulls(
        ["_listed_match_name", "Easting", "Northing"]
    ).filter(pl.col("Easting").is_finite() & pl.col("Northing").is_finite())
    if postcodes.is_empty() or listed.is_empty():
        return _empty_listed_candidates()

    postcode_count = postcodes.height
    k = max(1, min(nearest_postcodes, postcode_count))
    postcode_tree = cKDTree(
        np.column_stack(
            [postcodes["east1m"].to_numpy(), postcodes["north1m"].to_numpy()]
        )
    )
    distances, indices = postcode_tree.query(
        np.column_stack([listed["Easting"].to_numpy(), listed["Northing"].to_numpy()]),
        k=k,
        distance_upper_bound=max_distance_m,
    )
    # query() drops the neighbour axis when k == 1; restore it so that every
    # point always has a row of neighbours to iterate over.
    distances = distances.reshape(-1, k)
    indices = indices.reshape(-1, k)

    postcode_values = postcodes["postcode"].to_list()
    per_point = zip(
        listed["_listed_match_name"].to_list(),
        listed["Grade"].to_list(),
        listed["ListEntry"].to_list(),
        distances,
        indices,
    )
    rows: list[tuple[str, str, str | None, int | None]] = []
    for name_key, grade, entry, point_distances, point_indices in per_point:
        if not _is_matchable_listed_name(name_key):
            continue
        seen_postcodes: set[str] = set()
        for distance, postcode_idx in zip(point_distances, point_indices):
            # Neighbour slots with nothing in range come back with an
            # infinite distance and an index equal to the tree size.
            if postcode_idx >= postcode_count or not np.isfinite(distance):
                continue
            postcode = postcode_values[int(postcode_idx)]
            if postcode in seen_postcodes:
                continue
            seen_postcodes.add(postcode)
            rows.append((postcode, name_key, grade, entry))

    if not rows:
        return _empty_listed_candidates()

    schema = _listed_candidate_schema()
    candidates = pl.DataFrame(rows, schema=list(schema), orient="row")
    return candidates.cast(schema).unique(
        ["postcode", "_listed_match_name", "_listed_entry"]
    )
def _matched_listed_building_flags(
    properties: pl.LazyFrame,
    listed_candidates: pl.DataFrame,
    *,
    min_score: int = LISTED_BUILDING_MIN_MATCH_SCORE,
) -> pl.DataFrame:
    """Flag properties whose address fuzzily matches a nearby listed-building name.

    Args:
        properties: Lazy frame with ``postcode``, ``pp_address`` and
            ``epc_address`` columns.
        listed_candidates: Frame with ``postcode`` and ``_listed_match_name``
            columns, as produced by the postcode-candidate step.
        min_score: Minimum ``fuzz.token_set_ratio`` score for a match.

    Returns:
        A frame with ``postcode``, ``pp_address`` and the listed-building
        feature column set to "Yes", holding one row per matched
        (postcode, pp_address). Empty when nothing matches.
    """
    if listed_candidates.is_empty():
        return _empty_listed_property_flags()

    postcodes_with_candidates = listed_candidates.select("postcode").unique().lazy()
    has_address_key = (
        pl.col("_pp_match_address").is_not_null()
        | pl.col("_epc_match_address").is_not_null()
    )
    # The semi join keeps only properties in postcodes that have at least one
    # listed-building candidate, before any per-row Python work happens.
    property_candidates = (
        properties.select("postcode", "pp_address", "epc_address")
        .join(postcodes_with_candidates, on="postcode", how="semi")
        .with_columns(
            normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
            normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
        )
        .filter(pl.col("pp_address").is_not_null() & has_address_key)
        .collect(engine="streaming")
    )
    if property_candidates.is_empty():
        return _empty_listed_property_flags()

    names_by_postcode: dict[str, list[str]] = {}
    candidate_pairs = listed_candidates.select("postcode", "_listed_match_name")
    for postcode, name in candidate_pairs.iter_rows():
        if not (postcode and name):
            continue
        names_by_postcode.setdefault(postcode, []).append(name)

    matches: list[tuple[str, str, str]] = []
    for row in property_candidates.iter_rows(named=True):
        postcode = row["postcode"]
        listed_names = names_by_postcode.get(postcode)
        if not listed_names:
            continue

        # Try the price-paid key first, then the EPC key if it differs.
        pp_key = row["_pp_match_address"]
        epc_key = row["_epc_match_address"]
        address_keys = [pp_key] if pp_key else []
        if epc_key and epc_key != pp_key:
            address_keys.append(epc_key)

        # The number check runs first so that the fuzzy score is only
        # computed for pairs whose numbers agree.
        if any(
            _numbers_compatible(address_key, listed_name)
            and fuzz.token_set_ratio(address_key, listed_name) >= min_score
            for address_key in address_keys
            for listed_name in listed_names
        ):
            matches.append((postcode, row["pp_address"], "Yes"))

    if not matches:
        return _empty_listed_property_flags()

    flag_columns = ["postcode", "pp_address", LISTED_BUILDING_FEATURE]
    flags = pl.DataFrame(matches, schema=flag_columns, orient="row")
    return flags.cast({column: pl.Utf8 for column in flag_columns}).unique(
        ["postcode", "pp_address"]
    )
def _listed_building_flags(
    properties: pl.LazyFrame,
    active_postcodes: pl.DataFrame,
    listed_buildings_path: Path,
) -> pl.DataFrame:
    """Run the listed-building pipeline: load points, find candidates, match.

    Progress is reported on stdout at each stage. Returns the frame of
    matched (postcode, pp_address) flags produced by the matching step.
    """
    print(f"Loading listed-building points from {listed_buildings_path}...")
    listed_points = _load_listed_building_points(listed_buildings_path)
    print(f"Loaded {listed_points.height} listed-building point records")

    listed_candidates = _postcode_listed_building_candidates(
        listed_points=listed_points,
        active_postcodes=active_postcodes,
    )
    matching_message = (
        "Matching listed-building names to property addresses across "
        f"{listed_candidates['postcode'].n_unique()} nearby postcodes..."
    )
    print(matching_message)

    flags = _matched_listed_building_flags(
        properties=properties,
        listed_candidates=listed_candidates,
    )
    print(f"Matched {flags.height} property addresses to listed-building entries")
    return flags
def _normalise_crs(crs: object | None) -> str:
return str(crs) if crs else "EPSG:4326"
def _load_conservation_area_geometries(
    conservation_areas_path: Path,
) -> tuple[list[BaseGeometry], str]:
    """Read every non-empty geometry from the dataset together with its CRS.

    Args:
        conservation_areas_path: Dataset readable by pyogrio.

    Returns:
        The decoded geometries (null and empty ones removed) and the CRS
        string from the dataset metadata, defaulting to "EPSG:4326".

    Raises:
        ValueError: If no usable geometry remains after filtering.
    """
    # An empty column list loads the geometry column without any attributes.
    metadata, table = pyogrio.read_arrow(conservation_areas_path, columns=[])
    geometry_name = metadata.get("geometry_name") or table.column_names[-1]
    wkb_values = table[geometry_name].combine_chunks().to_pylist()

    geometries = [
        geom
        for geom in from_wkb(wkb_values)
        if geom is not None and not geom.is_empty
    ]
    if not geometries:
        raise ValueError(
            f"{conservation_areas_path} does not contain any usable polygon geometries"
        )
    return geometries, _normalise_crs(metadata.get("crs"))
def _postcode_conservation_area_flags(
    postcodes: pl.DataFrame,
    conservation_geometries: list[BaseGeometry],
    conservation_crs: object | None,
    batch_size: int = 100_000,
) -> pl.DataFrame:
    """Label every postcode "Yes" or "No" for lying within a conservation area.

    Args:
        postcodes: Frame with ``postcode``, ``lat`` and ``lon`` columns.
        conservation_geometries: Geometries to test the postcode points against.
        conservation_crs: CRS of the geometries; falsy values are treated as
            "EPSG:4326". Points are transformed from EPSG:4326 into it.
        batch_size: Number of points sent to the spatial index per query.

    Returns:
        One row per distinct non-null postcode with the conservation-area
        feature column. A postcode is "Yes" when any of its points intersects
        any geometry; postcodes with missing or non-finite coordinates are "No".

    Raises:
        ValueError: If a required column is absent from ``postcodes``.
    """
    missing = sorted({"postcode", "lat", "lon"} - set(postcodes.columns))
    if missing:
        raise ValueError(f"Postcode data missing required columns: {missing}")

    all_postcodes = postcodes.select("postcode").drop_nulls().unique()
    no_flag = pl.lit("No").alias(CONSERVATION_AREA_FEATURE)

    valid_points = postcodes.select("postcode", "lat", "lon").drop_nulls()
    if valid_points.is_empty():
        return all_postcodes.with_columns(no_flag)

    finite = np.isfinite(valid_points["lat"].to_numpy()) & np.isfinite(
        valid_points["lon"].to_numpy()
    )
    valid_points = valid_points.filter(pl.Series(finite))
    if valid_points.is_empty():
        return all_postcodes.with_columns(no_flag)

    point_count = valid_points.height
    to_area_crs = Transformer.from_crs(
        "EPSG:4326", _normalise_crs(conservation_crs), always_xy=True
    )
    # always_xy=True means the transformer takes (lon, lat) in that order.
    x, y = to_area_crs.transform(
        valid_points["lon"].to_numpy(), valid_points["lat"].to_numpy()
    )

    tree = STRtree(conservation_geometries)
    inside = np.zeros(point_count, dtype=bool)
    for start in range(0, point_count, batch_size):
        stop = min(start + batch_size, point_count)
        hits = tree.query(points(x[start:stop], y[start:stop]), predicate="intersects")
        if hits.size > 0:
            # Row 0 holds positions within the batch; shift them by the batch
            # offset to get positions within the full point array.
            inside[hits[0] + start] = True

    yes_no = (
        pl.when(pl.col("_within_conservation_area"))
        .then(pl.lit("Yes"))
        .otherwise(pl.lit("No"))
        .alias(CONSERVATION_AREA_FEATURE)
    )
    matched = (
        valid_points.select("postcode")
        .with_columns(pl.Series("_within_conservation_area", inside))
        .group_by("postcode")
        .agg(pl.col("_within_conservation_area").max())
        .with_columns(yes_no)
        .select("postcode", CONSERVATION_AREA_FEATURE)
    )

    # Postcodes that never had a usable point come out of the left join as
    # null and are reported as "No".
    joined = all_postcodes.join(matched, on="postcode", how="left")
    return joined.with_columns(
        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
    ).select("postcode", CONSERVATION_AREA_FEATURE)
def _conservation_area_by_postcode(
    postcodes: pl.LazyFrame,
    conservation_areas_path: Path,
) -> pl.LazyFrame:
    """Load conservation-area geometries and flag each postcode against them.

    Collects the ``postcode``, ``lat`` and ``lon`` columns of ``postcodes``,
    reports progress on stdout, and returns the per-postcode flags as a lazy
    frame.
    """
    print(f"Loading conservation area polygons from {conservation_areas_path}...")
    geometries, crs = _load_conservation_area_geometries(conservation_areas_path)

    postcode_points = postcodes.select("postcode", "lat", "lon").collect(
        engine="streaming"
    )
    progress_message = (
        "Computing conservation area membership for "
        f"{postcode_points.height} active English postcodes..."
    )
    print(progress_message)

    flags = _postcode_conservation_area_flags(postcode_points, geometries, crs)
    return flags.lazy()
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
"""Convert an IoD deprivation score to a 0-100 less-deprived percentile."""
non_null_count = pl.col(column).count()
@ -234,11 +649,13 @@ def _build(
noise_path: Path,
school_proximity_path: Path,
broadband_path: Path,
conservation_areas_path: Path,
rental_prices_path: Path,
lsoa_population_path: Path,
median_age_path: Path,
election_results_path: Path,
tree_density_postcodes_path: Path | None = None,
listed_buildings_path: Path | None = None,
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
@ -273,6 +690,29 @@ def _build(
).unique(["postcode"])
wide = wide.join(postcode_country, on="postcode", how="left")
if listed_buildings_path is not None:
active_postcodes_for_listed = (
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")
.filter(pl.col("doterm").is_null())
.select(
pl.col("pcds").alias("postcode"),
"east1m",
"north1m",
)
.collect(engine="streaming")
)
listed_flags = _listed_building_flags(
wide.select("postcode", "pp_address", "epc_address"),
active_postcodes_for_listed,
listed_buildings_path,
)
wide = wide.join(listed_flags.lazy(), on=["postcode", "pp_address"], how="left")
else:
wide = wide.with_columns(
pl.lit(None, dtype=pl.Utf8).alias(LISTED_BUILDING_FEATURE)
)
wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))
arcgis = (
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001") # England only
.filter(pl.col("doterm").is_null()) # Active postcodes only
@ -382,6 +822,13 @@ def _build(
school_proximity = pl.scan_parquet(school_proximity_path)
wide = wide.join(school_proximity, on="postcode", how="left")
conservation_areas = _conservation_area_by_postcode(
arcgis.select("postcode", "lat", "lon"), conservation_areas_path
)
wide = wide.join(conservation_areas, on="postcode", how="left").with_columns(
pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
)
if tree_density_postcodes_path is not None:
tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
wide = wide.join(tree_density, on="postcode", how="left")
@ -476,7 +923,6 @@ def _build(
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
"Income Deprivation Affecting Children Index (IDACI) Score (rate)",
"Barriers to Housing and Services Score",
"lsoa21",
"oa21",
"pcon",
"epc_property_type",
@ -598,6 +1044,18 @@ def main():
required=True,
help="Broadband performance by output area parquet file",
)
parser.add_argument(
"--conservation-areas",
type=Path,
required=True,
help="Historic England conservation areas GeoPackage",
)
parser.add_argument(
"--listed-buildings",
type=Path,
required=False,
help="Historic England NHLE listed-building points GeoPackage",
)
parser.add_argument(
"--rental-prices",
type=Path,
@ -652,11 +1110,13 @@ def main():
noise_path=args.noise,
school_proximity_path=args.school_proximity,
broadband_path=args.broadband,
conservation_areas_path=args.conservation_areas,
rental_prices_path=args.rental_prices,
lsoa_population_path=args.lsoa_population,
median_age_path=args.median_age,
election_results_path=args.election_results,
tree_density_postcodes_path=args.tree_density_postcodes,
listed_buildings_path=args.listed_buildings,
)
print(f"\nPostcode columns: {postcode_df.columns}")