has issues

This commit is contained in:
Andras Schmelczer 2026-05-25 13:20:17 +01:00
parent 2e112d7398
commit c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions

View file

@ -7,6 +7,27 @@ import polars as pl
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
MONTH_RE = r"^\d{4}-\d{2}$"
# Crime types that roll up into "Serious crime" / "Minor crime" aggregates.
# Must match the names used in pipeline/transform/merge.py for the sum_horizontal expressions.
SERIOUS_CRIME_TYPES = (
"Violence and sexual offences",
"Robbery",
"Burglary",
"Possession of weapons",
)
MINOR_CRIME_TYPES = (
"Anti-social behaviour",
"Criminal damage and arson",
"Shoplifting",
"Bicycle theft",
"Theft from the person",
"Other theft",
"Vehicle crime",
"Public order",
"Drugs",
"Other crime",
)
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
csvs = sorted(crime_dir.rglob("*.csv"))
@ -14,7 +35,12 @@ def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
return street_csvs, len(csvs) - len(street_csvs)
def transform_crime(crime_dir: Path, output_path: Path) -> None:
def transform_crime(
crime_dir: Path,
output_path: Path,
by_year_output_path: Path | None = None,
lsoa_lookup_path: Path | None = None,
) -> None:
csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
if not csvs:
raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
@ -38,6 +64,8 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
},
).select("LSOA code", "Crime type", "Month")
df = _apply_lsoa_2011_to_2021(df, lsoa_lookup_path)
valid_month_expr = pl.col("Month").str.contains(MONTH_RE)
valid_months = (
df.filter(valid_month_expr)
@ -57,6 +85,9 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
)
# Count monthly incidents, then annualise over every valid month in the dataset.
# `_weight` (≤1) comes from the LSOA 2011→2021 lookup: 2011 LSOAs that split
# into N 2021 LSOAs contribute 1/N of their count to each child, since we
# don't know which child a given incident actually belonged to.
yearly_counts = (
df.filter(
valid_month_expr
@ -66,7 +97,7 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
& (pl.col("Crime type") != "")
)
.group_by("LSOA code", "Month", "Crime type")
.agg(pl.len().alias("count"))
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "Crime type")
.agg(
(pl.col("count").sum() / pl.lit(valid_month_count) * 12)
@ -98,6 +129,118 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
wide.write_parquet(output_path, compression="zstd")
print(f"Saved to {output_path}")
if by_year_output_path is not None:
_write_crime_by_year(df, valid_month_expr, by_year_output_path)
def _write_crime_by_year(
df: pl.LazyFrame, valid_month_expr: pl.Expr, by_year_output_path: Path
) -> None:
"""Emit per-LSOA per-type per-year crime counts as nested list[struct] columns.
Partial years are scaled to a 12-month-equivalent count so cross-year trends
aren't distorted by months missing from the source data.
"""
filtered = df.filter(
valid_month_expr
& pl.col("LSOA code").is_not_null()
& (pl.col("LSOA code") != "")
& pl.col("Crime type").is_not_null()
& (pl.col("Crime type") != "")
).with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
# Months observed *anywhere* in the dataset for each year (annualisation denominator).
# Using crime-type-specific months would over-scale years where a rare type appears
# in only some months.
months_per_year = filtered.group_by("year").agg(
pl.col("Month").n_unique().alias("months_in_year")
)
yearly_per_type = (
filtered.group_by("LSOA code", "Crime type", "year", "Month")
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "Crime type", "year")
.agg(pl.col("count").sum().alias("count"))
.join(months_per_year, on="year")
.with_columns(
(pl.col("count").cast(pl.Float32) * 12.0 / pl.col("months_in_year"))
.round(1)
.alias("count")
)
.select("LSOA code", "Crime type", "year", "count")
.collect(engine="streaming")
)
if yearly_per_type.is_empty():
raise ValueError("No valid crime rows for by-year output")
serious_rollup = _rollup_long(yearly_per_type, SERIOUS_CRIME_TYPES, "Serious crime")
minor_rollup = _rollup_long(yearly_per_type, MINOR_CRIME_TYPES, "Minor crime")
combined = pl.concat([yearly_per_type, serious_rollup, minor_rollup])
by_lsoa_type = (
combined.sort("year")
.group_by("LSOA code", "Crime type")
.agg(pl.struct("year", "count").alias("series"))
)
wide_by_year = by_lsoa_type.pivot(
on="Crime type", index="LSOA code", values="series"
)
type_cols = [c for c in wide_by_year.columns if c != "LSOA code"]
wide_by_year = wide_by_year.rename({col: f"{col} (by year)" for col in type_cols})
print(f"By-year output shape: {wide_by_year.shape}")
print(f"By-year columns: {wide_by_year.columns}")
wide_by_year.write_parquet(by_year_output_path, compression="zstd")
print(f"Saved by-year output to {by_year_output_path}")
def _rollup_long(
yearly_per_type: pl.DataFrame, types: tuple[str, ...], rollup_name: str
) -> pl.DataFrame:
"""Sum per-year counts across a set of crime types into a single rollup type."""
return (
yearly_per_type.filter(pl.col("Crime type").is_in(list(types)))
.group_by("LSOA code", "year")
.agg(pl.col("count").sum().round(1).alias("count"))
.with_columns(pl.lit(rollup_name).alias("Crime type"))
.select("LSOA code", "Crime type", "year", "count")
)
def _apply_lsoa_2011_to_2021(
df: pl.LazyFrame, lsoa_lookup_path: Path | None
) -> pl.LazyFrame:
"""Remap pre-2022 LSOA 2011 codes to LSOA 2021 codes.
Police.uk reports older years using LSOA 2011 codes; the rest of the pipeline
keys on LSOA 2021. Without remapping, those years silently fail to join and
the crime-over-time chart only shows post-2022 data.
For 1:1 mappings the LSOA code is rewritten in place. For 1N splits (one
2011 LSOA becoming several 2021 ones), each child gets an even share via
`_weight = 1/N` since the source CSVs don't tell us which child a given
incident actually fell into.
"""
if lsoa_lookup_path is None:
return df.with_columns(pl.lit(1.0).alias("_weight"))
lookup = pl.scan_parquet(lsoa_lookup_path).select("lsoa11", "lsoa21")
weighted = lookup.with_columns(
(1.0 / pl.col("lsoa21").count().over("lsoa11")).alias("_weight")
)
return (
df.join(weighted, left_on="LSOA code", right_on="lsoa11", how="left")
.with_columns(
pl.coalesce("lsoa21", "LSOA code").alias("LSOA code"),
pl.col("_weight").fill_null(1.0),
)
.drop("lsoa21")
)
def main() -> None:
parser = argparse.ArgumentParser(
@ -109,8 +252,22 @@ def main() -> None:
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
parser.add_argument(
"--output-by-year",
type=Path,
required=False,
help="Optional output parquet for per-LSOA per-year per-type counts (nested list[struct])",
)
parser.add_argument(
"--lsoa-lookup",
type=Path,
required=False,
help="Optional parquet with columns (lsoa11, lsoa21) for remapping pre-2022 codes",
)
args = parser.parse_args()
transform_crime(args.input, args.output)
transform_crime(
args.input, args.output, args.output_by_year, args.lsoa_lookup
)
if __name__ == "__main__":

View file

@ -0,0 +1,159 @@
"""Build PMTiles point tiles for the crime heatmap overlay.
The output intentionally keeps point features rather than H3/grid aggregates so
MapLibre can render a true client-side heatmap. Police.uk coordinates are
published anonymous map points, not exact offence locations.
"""
from __future__ import annotations
import argparse
import json
import shutil
import subprocess
import tempfile
from pathlib import Path
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.crime import find_street_crime_csvs
def _latest_months(crime_dir: Path, month_count: int) -> list[str]:
csvs, _ignored = find_street_crime_csvs(crime_dir)
months = sorted({path.parent.name for path in csvs})
if not months:
raise FileNotFoundError(f"No street crime CSVs found in {crime_dir}")
return months[-month_count:]
def _street_csvs_for_months(crime_dir: Path, months: set[str]) -> list[Path]:
csvs, _ignored = find_street_crime_csvs(crime_dir)
selected = [path for path in csvs if path.parent.name in months]
if not selected:
raise FileNotFoundError(f"No street crime CSVs found for {sorted(months)}")
return selected
def _require_tippecanoe() -> str:
executable = shutil.which("tippecanoe")
if executable is None:
raise RuntimeError(
"tippecanoe is required to build crime hotspot PMTiles. "
"Install tippecanoe and rerun this target."
)
return executable
def _write_geojsonseq(csvs: list[Path], output_path: Path) -> int:
df = (
pl.scan_csv(
csvs,
schema_overrides={
"Longitude": pl.Float64,
"Latitude": pl.Float64,
"Month": pl.Utf8,
"Crime type": pl.Utf8,
},
ignore_errors=True,
)
.select(
pl.col("Longitude").alias("lon"),
pl.col("Latitude").alias("lat"),
pl.col("Month").alias("month"),
pl.col("Crime type").alias("crime_type"),
)
.drop_nulls(["lon", "lat"])
.filter(pl.col("lon").is_between(-9.5, 5.0))
.filter(pl.col("lat").is_between(49.0, 57.0))
.collect(engine="streaming")
)
with output_path.open("w") as file:
for row in df.iter_rows(named=True):
feature = {
"type": "Feature",
"geometry": {
"type": "Point",
"coordinates": [row["lon"], row["lat"]],
},
"properties": {
"count": 1,
"weight": 1,
"month": row["month"],
"crime_type": row["crime_type"],
},
}
file.write(json.dumps(feature, separators=(",", ":")) + "\n")
return df.height
def build_crime_hotspot_tiles(
crime_dir: Path,
output_path: Path,
months: int,
min_zoom: int,
max_zoom: int,
) -> None:
tippecanoe = _require_tippecanoe()
selected_months = set(_latest_months(crime_dir, months))
csvs = _street_csvs_for_months(crime_dir, selected_months)
output_path.parent.mkdir(parents=True, exist_ok=True)
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
ndjson_path = Path(tmp) / "crime_hotspots.geojsonseq"
feature_count = _write_geojsonseq(csvs, ndjson_path)
print(
f"Writing {feature_count:,} approximate crime heatmap points "
f"from {min(selected_months)} to {max(selected_months)}"
)
subprocess.run(
[
tippecanoe,
"--force",
"--output",
str(output_path),
"--layer",
"crime_hotspots",
"--minimum-zoom",
str(min_zoom),
"--maximum-zoom",
str(max_zoom),
"--drop-densest-as-needed",
"--extend-zooms-if-still-dropping",
str(ndjson_path),
],
check=True,
)
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--input", type=Path, required=True, help="Crime CSV directory")
parser.add_argument(
"--output", type=Path, required=True, help="Output .pmtiles path"
)
parser.add_argument(
"--months",
type=int,
default=12,
help="Latest complete months to include in the heatmap",
)
parser.add_argument("--min-zoom", type=int, default=12)
parser.add_argument("--max-zoom", type=int, default=16)
args = parser.parse_args()
build_crime_hotspot_tiles(
args.input,
args.output,
args.months,
args.min_zoom,
args.max_zoom,
)
if __name__ == "__main__":
main()

View file

@ -1,12 +1,27 @@
import argparse
import re
import numpy as np
import polars as pl
from pathlib import Path
import pyogrio
from pyproj import Transformer
from scipy.spatial import cKDTree
from shapely import from_wkb, points
from shapely.geometry.base import BaseGeometry
from shapely.strtree import STRtree
from thefuzz import fuzz
from pipeline.utils.fuzzy_join import normalize_address_key
from pipeline.utils.postcode_mapping import build_postcode_mapping
MIN_FLOOR_AREA_M2 = 10
CONSERVATION_AREA_FEATURE = "Within conservation area"
LISTED_BUILDING_FEATURE = "Listed building"
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
LISTED_BUILDING_NEAREST_POSTCODES = 3
LISTED_BUILDING_MIN_MATCH_SCORE = 95
_IOD_PERCENTILE_COLUMNS = [
"Education, Skills and Training Score",
@ -24,6 +39,8 @@ _AREA_COLUMNS = [
"lon",
# Runtime provenance for deciding whether missing coordinates are skippable.
"ctry25cd",
# Keyed lookup for postcode-level side tables (e.g. crime time series).
"lsoa21",
# Deprivation
"Income Score",
"Employment Score",
@ -63,6 +80,7 @@ _AREA_COLUMNS = [
# Environment
"Noise (dB)",
"Max available download speed (Mbps)",
CONSERVATION_AREA_FEATURE,
# Schools
"Good+ primary schools within 5km",
"Good+ secondary schools within 5km",
@ -97,6 +115,20 @@ _RENT_SOURCE_UNAVAILABLE_LADS = {
"E06000053": "Isles of Scilly",
"E09000001": "City of London",
}
_NUMBER_RE = re.compile(r"\d+")
_LISTED_NAME_STOP_WORDS = {
"A",
"AN",
"AND",
"AT",
"BY",
"IN",
"OF",
"ON",
"THE",
"TO",
"WITH",
}
def _is_dynamic_poi_metric_column(column: str) -> bool:
@ -105,6 +137,389 @@ def _is_dynamic_poi_metric_column(column: str) -> bool:
)
def _numbers_compatible(left: str, right: str) -> bool:
"""Require address/list-entry numbers to agree when either side has numbers."""
left_nums = set(_NUMBER_RE.findall(left))
right_nums = set(_NUMBER_RE.findall(right))
smaller, larger = (
(left_nums, right_nums)
if len(left_nums) <= len(right_nums)
else (right_nums, left_nums)
)
if not smaller and larger:
return False
return smaller.issubset(larger)
def _listed_candidate_schema() -> dict[str, pl.DataType]:
return {
"postcode": pl.Utf8,
"_listed_match_name": pl.Utf8,
"_listed_grade": pl.Utf8,
"_listed_entry": pl.Int64,
}
def _empty_listed_candidates() -> pl.DataFrame:
return pl.DataFrame(schema=_listed_candidate_schema())
def _empty_listed_property_flags() -> pl.DataFrame:
return pl.DataFrame(
schema={
"postcode": pl.Utf8,
"pp_address": pl.Utf8,
LISTED_BUILDING_FEATURE: pl.Utf8,
}
)
def _is_matchable_listed_name(name_key: str | None) -> bool:
if not name_key:
return False
if _NUMBER_RE.search(name_key):
return True
substantive_tokens = [
token
for token in name_key.split()
if token not in _LISTED_NAME_STOP_WORDS and len(token) >= 3
]
return len(substantive_tokens) >= 2
def _load_listed_building_points(listed_buildings_path: Path) -> pl.DataFrame:
"""Load Historic England NHLE listed-building point attributes."""
columns = ["ListEntry", "Name", "Grade", "Easting", "Northing"]
info = pyogrio.read_info(listed_buildings_path)
geometry_type = str(info.get("geometry_type") or "")
if "Point" not in geometry_type:
raise ValueError(
f"Expected listed-building point data, got geometry {geometry_type!r}"
)
_, table = pyogrio.read_arrow(
listed_buildings_path,
columns=columns,
read_geometry=False,
)
df = pl.from_arrow(table)
missing = sorted(set(columns) - set(df.columns))
if missing:
raise ValueError(
f"{listed_buildings_path} is missing listed-building columns: {missing}"
)
return (
df.select(
pl.col("ListEntry").cast(pl.Int64),
pl.col("Name").cast(pl.Utf8),
pl.col("Grade").cast(pl.Utf8),
pl.col("Easting").cast(pl.Float64),
pl.col("Northing").cast(pl.Float64),
)
.drop_nulls(["Name", "Easting", "Northing"])
.with_columns(normalize_address_key(pl.col("Name")).alias("_listed_match_name"))
.filter(pl.col("_listed_match_name").is_not_null())
)
def _postcode_listed_building_candidates(
listed_points: pl.DataFrame,
active_postcodes: pl.DataFrame,
*,
nearest_postcodes: int = LISTED_BUILDING_NEAREST_POSTCODES,
max_distance_m: float = LISTED_BUILDING_MATCH_RADIUS_M,
) -> pl.DataFrame:
"""Assign each listed-building point to nearby active postcode candidates."""
if listed_points.is_empty() or active_postcodes.is_empty():
return _empty_listed_candidates()
required_postcode_cols = {"postcode", "east1m", "north1m"}
missing = sorted(required_postcode_cols - set(active_postcodes.columns))
if missing:
raise ValueError(f"Active postcode data missing required columns: {missing}")
required_listed_cols = {
"_listed_match_name",
"Grade",
"ListEntry",
"Easting",
"Northing",
}
missing = sorted(required_listed_cols - set(listed_points.columns))
if missing:
raise ValueError(f"Listed-building data missing required columns: {missing}")
postcodes = active_postcodes.drop_nulls(["postcode", "east1m", "north1m"])
postcodes = postcodes.filter(
pl.col("east1m").is_finite() & pl.col("north1m").is_finite()
)
listed = listed_points.drop_nulls(["_listed_match_name", "Easting", "Northing"])
listed = listed.filter(
pl.col("Easting").is_finite() & pl.col("Northing").is_finite()
)
if postcodes.is_empty() or listed.is_empty():
return _empty_listed_candidates()
postcode_coords = np.column_stack(
[postcodes["east1m"].to_numpy(), postcodes["north1m"].to_numpy()]
)
listed_coords = np.column_stack(
[listed["Easting"].to_numpy(), listed["Northing"].to_numpy()]
)
k = max(1, min(nearest_postcodes, postcodes.height))
distances, indices = cKDTree(postcode_coords).query(
listed_coords,
k=k,
distance_upper_bound=max_distance_m,
)
if k == 1:
distances = distances[:, np.newaxis]
indices = indices[:, np.newaxis]
postcode_values = postcodes["postcode"].to_list()
listed_names = listed["_listed_match_name"].to_list()
listed_grades = listed["Grade"].to_list()
listed_entries = listed["ListEntry"].to_list()
rows: list[tuple[str, str, str | None, int | None]] = []
for listed_idx in range(listed.height):
name_key = listed_names[listed_idx]
if not _is_matchable_listed_name(name_key):
continue
seen_postcodes: set[str] = set()
for distance, postcode_idx in zip(distances[listed_idx], indices[listed_idx]):
if not np.isfinite(distance) or postcode_idx >= postcodes.height:
continue
postcode = postcode_values[int(postcode_idx)]
if postcode in seen_postcodes:
continue
seen_postcodes.add(postcode)
rows.append(
(
postcode,
name_key,
listed_grades[listed_idx],
listed_entries[listed_idx],
)
)
if not rows:
return _empty_listed_candidates()
return (
pl.DataFrame(
rows,
schema=[
"postcode",
"_listed_match_name",
"_listed_grade",
"_listed_entry",
],
orient="row",
)
.cast(_listed_candidate_schema())
.unique(["postcode", "_listed_match_name", "_listed_entry"])
)
def _matched_listed_building_flags(
properties: pl.LazyFrame,
listed_candidates: pl.DataFrame,
*,
min_score: int = LISTED_BUILDING_MIN_MATCH_SCORE,
) -> pl.DataFrame:
"""Return property keys that conservatively match an NHLE listed entry."""
if listed_candidates.is_empty():
return _empty_listed_property_flags()
candidate_postcodes = listed_candidates.select("postcode").unique()
property_candidates = (
properties.select("postcode", "pp_address", "epc_address")
.join(candidate_postcodes.lazy(), on="postcode", how="semi")
.with_columns(
normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
)
.filter(
pl.col("pp_address").is_not_null()
& (
pl.col("_pp_match_address").is_not_null()
| pl.col("_epc_match_address").is_not_null()
)
)
.collect(engine="streaming")
)
if property_candidates.is_empty():
return _empty_listed_property_flags()
listed_by_postcode: dict[str, list[str]] = {}
for postcode, name in listed_candidates.select(
"postcode", "_listed_match_name"
).iter_rows():
if postcode and name:
listed_by_postcode.setdefault(postcode, []).append(name)
matches: list[tuple[str, str, str]] = []
for row in property_candidates.iter_rows(named=True):
postcode = row["postcode"]
listed_names = listed_by_postcode.get(postcode)
if not listed_names:
continue
address_keys = []
for col in ("_pp_match_address", "_epc_match_address"):
value = row.get(col)
if value and value not in address_keys:
address_keys.append(value)
matched = False
for address_key in address_keys:
for listed_name in listed_names:
if not _numbers_compatible(address_key, listed_name):
continue
if fuzz.token_set_ratio(address_key, listed_name) >= min_score:
matched = True
break
if matched:
break
if matched:
matches.append((postcode, row["pp_address"], "Yes"))
if not matches:
return _empty_listed_property_flags()
return (
pl.DataFrame(
matches,
schema=["postcode", "pp_address", LISTED_BUILDING_FEATURE],
orient="row",
)
.cast(
{
"postcode": pl.Utf8,
"pp_address": pl.Utf8,
LISTED_BUILDING_FEATURE: pl.Utf8,
}
)
.unique(["postcode", "pp_address"])
)
def _listed_building_flags(
properties: pl.LazyFrame,
active_postcodes: pl.DataFrame,
listed_buildings_path: Path,
) -> pl.DataFrame:
print(f"Loading listed-building points from {listed_buildings_path}...")
listed_points = _load_listed_building_points(listed_buildings_path)
print(f"Loaded {listed_points.height} listed-building point records")
listed_candidates = _postcode_listed_building_candidates(
listed_points, active_postcodes
)
print(
"Matching listed-building names to property addresses across "
f"{listed_candidates['postcode'].n_unique()} nearby postcodes..."
)
flags = _matched_listed_building_flags(properties, listed_candidates)
print(f"Matched {flags.height} property addresses to listed-building entries")
return flags
def _normalise_crs(crs: object | None) -> str:
return str(crs) if crs else "EPSG:4326"
def _load_conservation_area_geometries(
conservation_areas_path: Path,
) -> tuple[list[BaseGeometry], str]:
metadata, table = pyogrio.read_arrow(conservation_areas_path, columns=[])
geometry_name = metadata.get("geometry_name") or table.column_names[-1]
geometries = []
for geom in from_wkb(table[geometry_name].combine_chunks().to_pylist()):
if geom is not None and not geom.is_empty:
geometries.append(geom)
if not geometries:
raise ValueError(
f"{conservation_areas_path} does not contain any usable polygon geometries"
)
return geometries, _normalise_crs(metadata.get("crs"))
def _postcode_conservation_area_flags(
postcodes: pl.DataFrame,
conservation_geometries: list[BaseGeometry],
conservation_crs: object | None,
batch_size: int = 100_000,
) -> pl.DataFrame:
required = {"postcode", "lat", "lon"}
missing = sorted(required - set(postcodes.columns))
if missing:
raise ValueError(f"Postcode data missing required columns: {missing}")
all_postcodes = postcodes.select("postcode").drop_nulls().unique()
valid_points = postcodes.select("postcode", "lat", "lon").drop_nulls()
if valid_points.is_empty():
return all_postcodes.with_columns(pl.lit("No").alias(CONSERVATION_AREA_FEATURE))
lat = valid_points["lat"].to_numpy()
lon = valid_points["lon"].to_numpy()
finite = np.isfinite(lat) & np.isfinite(lon)
valid_points = valid_points.filter(pl.Series(finite))
if valid_points.is_empty():
return all_postcodes.with_columns(pl.lit("No").alias(CONSERVATION_AREA_FEATURE))
lat = valid_points["lat"].to_numpy()
lon = valid_points["lon"].to_numpy()
transformer = Transformer.from_crs(
"EPSG:4326", _normalise_crs(conservation_crs), always_xy=True
)
x, y = transformer.transform(lon, lat)
tree = STRtree(conservation_geometries)
inside = np.zeros(valid_points.height, dtype=bool)
for start in range(0, valid_points.height, batch_size):
end = min(start + batch_size, valid_points.height)
point_batch = points(x[start:end], y[start:end])
matches = tree.query(point_batch, predicate="intersects")
if matches.size > 0:
inside[start + matches[0]] = True
matched = (
valid_points.select("postcode")
.with_columns(pl.Series("_within_conservation_area", inside))
.group_by("postcode")
.agg(pl.col("_within_conservation_area").max())
.with_columns(
pl.when(pl.col("_within_conservation_area"))
.then(pl.lit("Yes"))
.otherwise(pl.lit("No"))
.alias(CONSERVATION_AREA_FEATURE)
)
.select("postcode", CONSERVATION_AREA_FEATURE)
)
return (
all_postcodes.join(matched, on="postcode", how="left")
.with_columns(pl.col(CONSERVATION_AREA_FEATURE).fill_null("No"))
.select("postcode", CONSERVATION_AREA_FEATURE)
)
def _conservation_area_by_postcode(
postcodes: pl.LazyFrame,
conservation_areas_path: Path,
) -> pl.LazyFrame:
print(f"Loading conservation area polygons from {conservation_areas_path}...")
geometries, crs = _load_conservation_area_geometries(conservation_areas_path)
postcode_points = postcodes.select("postcode", "lat", "lon").collect(
engine="streaming"
)
print(
"Computing conservation area membership for "
f"{postcode_points.height} active English postcodes..."
)
return _postcode_conservation_area_flags(postcode_points, geometries, crs).lazy()
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
"""Convert an IoD deprivation score to a 0-100 less-deprived percentile."""
non_null_count = pl.col(column).count()
@ -234,11 +649,13 @@ def _build(
noise_path: Path,
school_proximity_path: Path,
broadband_path: Path,
conservation_areas_path: Path,
rental_prices_path: Path,
lsoa_population_path: Path,
median_age_path: Path,
election_results_path: Path,
tree_density_postcodes_path: Path | None = None,
listed_buildings_path: Path | None = None,
) -> tuple[pl.DataFrame, pl.DataFrame]:
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
@ -273,6 +690,29 @@ def _build(
).unique(["postcode"])
wide = wide.join(postcode_country, on="postcode", how="left")
if listed_buildings_path is not None:
active_postcodes_for_listed = (
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")
.filter(pl.col("doterm").is_null())
.select(
pl.col("pcds").alias("postcode"),
"east1m",
"north1m",
)
.collect(engine="streaming")
)
listed_flags = _listed_building_flags(
wide.select("postcode", "pp_address", "epc_address"),
active_postcodes_for_listed,
listed_buildings_path,
)
wide = wide.join(listed_flags.lazy(), on=["postcode", "pp_address"], how="left")
else:
wide = wide.with_columns(
pl.lit(None, dtype=pl.Utf8).alias(LISTED_BUILDING_FEATURE)
)
wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))
arcgis = (
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001") # England only
.filter(pl.col("doterm").is_null()) # Active postcodes only
@ -382,6 +822,13 @@ def _build(
school_proximity = pl.scan_parquet(school_proximity_path)
wide = wide.join(school_proximity, on="postcode", how="left")
conservation_areas = _conservation_area_by_postcode(
arcgis.select("postcode", "lat", "lon"), conservation_areas_path
)
wide = wide.join(conservation_areas, on="postcode", how="left").with_columns(
pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
)
if tree_density_postcodes_path is not None:
tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
wide = wide.join(tree_density, on="postcode", how="left")
@ -476,7 +923,6 @@ def _build(
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
"Income Deprivation Affecting Children Index (IDACI) Score (rate)",
"Barriers to Housing and Services Score",
"lsoa21",
"oa21",
"pcon",
"epc_property_type",
@ -598,6 +1044,18 @@ def main():
required=True,
help="Broadband performance by output area parquet file",
)
parser.add_argument(
"--conservation-areas",
type=Path,
required=True,
help="Historic England conservation areas GeoPackage",
)
parser.add_argument(
"--listed-buildings",
type=Path,
required=False,
help="Historic England NHLE listed-building points GeoPackage",
)
parser.add_argument(
"--rental-prices",
type=Path,
@ -652,11 +1110,13 @@ def main():
noise_path=args.noise,
school_proximity_path=args.school_proximity,
broadband_path=args.broadband,
conservation_areas_path=args.conservation_areas,
rental_prices_path=args.rental_prices,
lsoa_population_path=args.lsoa_population,
median_age_path=args.median_age,
election_results_path=args.election_results,
tree_density_postcodes_path=args.tree_density_postcodes,
listed_buildings_path=args.listed_buildings,
)
print(f"\nPostcode columns: {postcode_df.columns}")

View file

@ -0,0 +1,398 @@
"""Build PMTiles raster tiles for the high-resolution Defra noise overlay.
This keeps the native 10m strategic-noise rasters as the source of truth and
renders transparent PNG XYZ tiles into MBTiles before converting to PMTiles.
The dashboard serves the resulting archive through /api/overlays/noise.
"""
from __future__ import annotations
import argparse
import io
import math
import sqlite3
import subprocess
import tempfile
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import rasterio
from PIL import Image
from rasterio.enums import Resampling
from rasterio.transform import from_bounds
from rasterio.warp import reproject, transform_bounds
from shapely import STRtree, box
from pipeline.download.noise import (
BNG_MAX_E,
BNG_MAX_N,
BNG_MIN_E,
BNG_MIN_N,
NOISE_SOURCES,
download_raster,
)
from pipeline.download.tiles import ensure_pmtiles_cli
from pipeline.local_temp import local_tmp_dir
WEB_MERCATOR_CRS = "EPSG:3857"
WEB_MERCATOR_EXTENT = 20_037_508.342789244
DEFAULT_SOURCE_NAMES = ("road", "rail", "airport")
NOISE_COLOR_STOPS = np.array([45.0, 55.0, 65.0, 75.0], dtype=np.float32)
NOISE_COLORS = np.array(
[
[254, 240, 138],
[251, 146, 60],
[220, 38, 38],
[127, 29, 29],
],
dtype=np.float32,
)
@dataclass(frozen=True)
class RasterInfo:
path: Path
bounds_mercator: tuple[float, float, float, float]
def _source_specs(source_names: tuple[str, ...]):
requested = {name.lower() for name in source_names}
if "all" in requested:
requested = set(DEFAULT_SOURCE_NAMES)
by_name = {label.lower(): spec for label, *spec in NOISE_SOURCES}
unknown = sorted(requested - set(by_name))
if unknown:
raise ValueError(f"Unknown noise source(s): {', '.join(unknown)}")
return [
(name.title(), *by_name[name])
for name in DEFAULT_SOURCE_NAMES
if name in requested
]
def _download_source_rasters(
raster_dir: Path,
source_names: tuple[str, ...],
) -> list[Path]:
paths: list[Path] = []
raster_dir.mkdir(parents=True, exist_ok=True)
for (
label,
_col_name,
wcs_base,
coverage_id,
wcs_version,
allow_missing_tiles,
) in _source_specs(source_names):
tile_dir = raster_dir / label.lower()
tile_dir.mkdir(parents=True, exist_ok=True)
paths.extend(
download_raster(
tile_dir,
wcs_base,
coverage_id,
label,
wcs_version,
allow_missing_tiles,
)
)
return paths
def _raster_infos(raster_paths: list[Path]) -> list[RasterInfo]:
infos: list[RasterInfo] = []
for path in raster_paths:
with rasterio.open(path) as dataset:
if dataset.crs is None:
raise ValueError(f"Raster has no CRS: {path}")
bounds = transform_bounds(
dataset.crs,
WEB_MERCATOR_CRS,
*dataset.bounds,
densify_pts=21,
)
infos.append(RasterInfo(path=path, bounds_mercator=bounds))
return infos
def _england_bounds_wgs84() -> tuple[float, float, float, float]:
return transform_bounds(
"EPSG:27700",
"EPSG:4326",
BNG_MIN_E,
BNG_MIN_N,
BNG_MAX_E,
BNG_MAX_N,
densify_pts=21,
)
def _lonlat_to_tile(lon: float, lat: float, zoom: int) -> tuple[int, int]:
lat = max(min(lat, 85.05112878), -85.05112878)
n = 1 << zoom
x = int(math.floor((lon + 180.0) / 360.0 * n))
y = int(
math.floor((1.0 - math.asinh(math.tan(math.radians(lat))) / math.pi) / 2.0 * n)
)
return min(max(x, 0), n - 1), min(max(y, 0), n - 1)
def _tile_bounds_mercator(
zoom: int, x: int, y: int
) -> tuple[float, float, float, float]:
n = 1 << zoom
tile_size_m = WEB_MERCATOR_EXTENT * 2 / n
left = -WEB_MERCATOR_EXTENT + x * tile_size_m
right = left + tile_size_m
top = WEB_MERCATOR_EXTENT - y * tile_size_m
bottom = top - tile_size_m
return left, bottom, right, top
def _read_noise_tile(
candidates: list[RasterInfo],
bounds_mercator: tuple[float, float, float, float],
tile_size: int,
) -> np.ndarray:
left, bottom, right, top = bounds_mercator
merged = np.full((tile_size, tile_size), np.nan, dtype=np.float32)
for info in candidates:
with rasterio.open(info.path) as source:
tile = np.full((tile_size, tile_size), np.nan, dtype=np.float32)
reproject(
source=rasterio.band(source, 1),
destination=tile,
src_transform=source.transform,
src_crs=source.crs,
src_nodata=source.nodata if source.nodata is not None else 0,
dst_transform=from_bounds(
left, bottom, right, top, tile_size, tile_size
),
dst_crs=WEB_MERCATOR_CRS,
dst_nodata=np.nan,
resampling=Resampling.bilinear,
)
tile[~np.isfinite(tile) | (tile <= 0)] = np.nan
merged = np.fmax(merged, tile)
return merged
def _encode_noise_png(noise_db: np.ndarray) -> bytes | None:
valid = np.isfinite(noise_db) & (noise_db >= NOISE_COLOR_STOPS[0])
if not valid.any():
return None
clipped = np.clip(noise_db, NOISE_COLOR_STOPS[0], NOISE_COLOR_STOPS[-1])
rgba = np.zeros((*noise_db.shape, 4), dtype=np.uint8)
valid_values = clipped[valid]
for channel in range(3):
channel_values = np.interp(
valid_values,
NOISE_COLOR_STOPS,
NOISE_COLORS[:, channel],
).astype(np.uint8)
rgba[..., channel][valid] = channel_values
alpha = np.interp(
valid_values,
[NOISE_COLOR_STOPS[0], NOISE_COLOR_STOPS[-1]],
[70, 190],
).astype(np.uint8)
rgba[..., 3][valid] = alpha
output = io.BytesIO()
Image.fromarray(rgba, mode="RGBA").save(output, format="PNG", optimize=True)
return output.getvalue()
def _tile_ranges(
bounds_wgs84: tuple[float, float, float, float],
zoom: int,
) -> tuple[range, range]:
west, south, east, north = bounds_wgs84
min_x, min_y = _lonlat_to_tile(west, north, zoom)
max_x, max_y = _lonlat_to_tile(east, south, zoom)
return range(min_x, max_x + 1), range(min_y, max_y + 1)
def _create_mbtiles(
raster_infos: list[RasterInfo],
mbtiles_path: Path,
min_zoom: int,
max_zoom: int,
tile_size: int,
) -> int:
if mbtiles_path.exists():
mbtiles_path.unlink()
bounds_wgs84 = _england_bounds_wgs84()
geometries = [box(*info.bounds_mercator) for info in raster_infos]
tree = STRtree(geometries)
conn = sqlite3.connect(mbtiles_path)
conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)")
conn.execute(
"CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, "
"tile_row INTEGER, tile_data BLOB)"
)
conn.execute(
"CREATE UNIQUE INDEX tile_index ON tiles (zoom_level, tile_column, tile_row)"
)
conn.executemany(
"INSERT INTO metadata (name, value) VALUES (?, ?)",
[
("name", "Defra Lden noise overlay"),
("type", "overlay"),
("version", "1"),
("description", "Defra Round 4 10m strategic noise Lden overlay"),
("format", "png"),
(
"attribution",
"Contains public sector information licensed under the OGL v3.0",
),
("bounds", ",".join(f"{value:.6f}" for value in bounds_wgs84)),
("minzoom", str(min_zoom)),
("maxzoom", str(max_zoom)),
],
)
total_tiles = 0
try:
for zoom in range(min_zoom, max_zoom + 1):
x_range, y_range = _tile_ranges(bounds_wgs84, zoom)
zoom_tiles = 0
for x in x_range:
for y in y_range:
bounds_mercator = _tile_bounds_mercator(zoom, x, y)
candidate_indexes = tree.query(box(*bounds_mercator))
if len(candidate_indexes) == 0:
continue
candidates = [
raster_infos[int(index)] for index in candidate_indexes
]
tile = _read_noise_tile(candidates, bounds_mercator, tile_size)
tile_png = _encode_noise_png(tile)
if tile_png is None:
continue
tms_y = (1 << zoom) - 1 - y
conn.execute(
"INSERT INTO tiles VALUES (?, ?, ?, ?)",
(zoom, x, tms_y, tile_png),
)
zoom_tiles += 1
total_tiles += 1
conn.commit()
print(f"Zoom {zoom}: wrote {zoom_tiles:,} PNG tiles")
finally:
conn.close()
return total_tiles
def build_noise_overlay_tiles(
output_path: Path,
raster_dir: Path,
source_names: tuple[str, ...],
input_rasters: tuple[Path, ...],
pmtiles_bin: Path,
pmtiles_version: str,
min_zoom: int,
max_zoom: int,
tile_size: int,
) -> None:
if min_zoom > max_zoom:
raise ValueError("--min-zoom must be <= --max-zoom")
raster_paths = list(input_rasters) or _download_source_rasters(
raster_dir, source_names
)
if not raster_paths:
raise FileNotFoundError("No noise raster GeoTIFFs available")
print(f"Preparing {len(raster_paths):,} noise raster tile(s)")
raster_infos = _raster_infos(raster_paths)
output_path.parent.mkdir(parents=True, exist_ok=True)
ensure_pmtiles_cli(pmtiles_bin, pmtiles_version)
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
mbtiles_path = Path(tmp) / "noise_lden_10m.mbtiles"
tile_count = _create_mbtiles(
raster_infos, mbtiles_path, min_zoom, max_zoom, tile_size
)
if tile_count == 0:
raise RuntimeError("Noise overlay generation produced no tiles")
subprocess.run(
[
str(pmtiles_bin),
"convert",
str(mbtiles_path),
str(output_path),
"--force",
],
check=True,
)
size_mb = output_path.stat().st_size / (1024 * 1024)
print(f"Wrote {output_path} ({size_mb:.1f} MB)")
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--output", type=Path, required=True)
parser.add_argument(
"--raster-dir",
type=Path,
default=Path("property-data/noise_overlay_rasters"),
help="Cache directory for downloaded Defra WCS GeoTIFF tiles",
)
parser.add_argument(
"--source",
action="append",
dest="sources",
choices=("all", *DEFAULT_SOURCE_NAMES),
help="Noise source to include; repeatable. Defaults to all.",
)
parser.add_argument(
"--input-raster",
action="append",
dest="input_rasters",
type=Path,
help="Existing GeoTIFF to render instead of downloading WCS rasters",
)
parser.add_argument(
"--pmtiles-bin", type=Path, default=Path("property-data/pmtiles")
)
parser.add_argument("--pmtiles-version", default="1.22.3")
parser.add_argument("--min-zoom", type=int, default=13)
parser.add_argument("--max-zoom", type=int, default=14)
parser.add_argument("--tile-size", type=int, default=256)
args = parser.parse_args()
build_noise_overlay_tiles(
output_path=args.output,
raster_dir=args.raster_dir,
source_names=tuple(args.sources or ("all",)),
input_rasters=tuple(args.input_rasters or ()),
pmtiles_bin=args.pmtiles_bin,
pmtiles_version=args.pmtiles_version,
min_zoom=args.min_zoom,
max_zoom=args.max_zoom,
tile_size=args.tile_size,
)
if __name__ == "__main__":
main()

View file

@ -39,6 +39,8 @@ def main():
pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
& pl.col("Latest OEIF overall effectiveness").is_in(["1", "2"])
)
if ofsted.is_empty():
raise ValueError("No good+ primary/secondary Ofsted schools found")
print(f"Good+ schools: {len(ofsted):,}")
print(
@ -74,6 +76,8 @@ def main():
)
schools = ofsted.join(arcgis, on="postcode", how="inner")
if schools.is_empty():
raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates")
print(f"Schools with coordinates: {len(schools):,}")
# Load all postcodes for proximity counting
@ -88,6 +92,7 @@ def main():
result = counts_5km.join(counts_2km, on="postcode")
args.output.parent.mkdir(parents=True, exist_ok=True)
result.write_parquet(args.output)
size_mb = args.output.stat().st_size / (1024 * 1024)
print(f"Wrote {args.output} ({size_mb:.1f} MB)")

View file

@ -95,6 +95,69 @@ def test_transform_crime_annualises_over_all_valid_months(tmp_path):
]
def test_transform_crime_writes_by_year_output(tmp_path):
crime_dir = tmp_path / "crime"
jan23 = crime_dir / "2023-01"
jan24 = crime_dir / "2024-01"
feb24 = crime_dir / "2024-02"
for d in (jan23, jan24, feb24):
d.mkdir(parents=True)
header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
(jan23 / "2023-01-test-force-street.csv").write_text(
"\n".join(
[
header,
"1,2023-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
"2,2023-01,F,F,-0.1,51.5,X,E01000001,L,Robbery,U,",
]
)
+ "\n"
)
(jan24 / "2024-01-test-force-street.csv").write_text(
"\n".join(
[
header,
"3,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
"4,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
]
)
+ "\n"
)
(feb24 / "2024-02-test-force-street.csv").write_text(
"\n".join(
[
header,
"5,2024-02,F,F,-0.1,51.5,X,E01000001,L,Anti-social behaviour,U,",
]
)
+ "\n"
)
output = tmp_path / "crime.parquet"
by_year_output = tmp_path / "crime_by_year.parquet"
transform_crime(crime_dir, output, by_year_output)
by_year = pl.read_parquet(by_year_output)
assert by_year.height == 1
cols = set(by_year.columns)
assert "Burglary (by year)" in cols
assert "Serious crime (by year)" in cols
assert "Minor crime (by year)" in cols
row = by_year.row(0, named=True)
burglary = sorted(row["Burglary (by year)"], key=lambda r: r["year"])
# 2023: 1 burglary in 1 month → 12/yr; 2024: 2 in 2 months → 12/yr
assert burglary == [
{"year": 2023, "count": 12.0},
{"year": 2024, "count": 12.0},
]
# Serious crime in 2023 = Burglary(12) + Robbery(12) = 24
serious = {p["year"]: p["count"] for p in row["Serious crime (by year)"]}
assert serious[2023] == 24.0
assert serious[2024] == 12.0
def test_transform_crime_fails_without_valid_months(tmp_path):
crime_dir = tmp_path / "crime"
month_dir = crime_dir / "2024-01"
@ -117,3 +180,49 @@ def test_transform_crime_fails_without_valid_months(tmp_path):
assert "No valid crime months" in str(exc)
else:
raise AssertionError("Expected ValueError")
def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
crime_dir = tmp_path / "crime"
month_dir = crime_dir / "2024-01"
month_dir.mkdir(parents=True)
header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
# E01000001 was split into two 2021 LSOAs; E01000099 is unchanged.
(month_dir / "2024-01-test-force-street.csv").write_text(
"\n".join(
[
header,
"1,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
"2,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
"3,2024-01,F,F,-0.1,51.5,X,E01000099,L,Burglary,U,",
]
)
+ "\n"
)
lookup_path = tmp_path / "lookup.parquet"
pl.DataFrame(
{
"lsoa11": ["E01000001", "E01000001", "E01000099"],
"lsoa21": ["E01000050", "E01000051", "E01000099"],
}
).write_parquet(lookup_path)
output = tmp_path / "crime.parquet"
by_year_output = tmp_path / "by_year.parquet"
transform_crime(crime_dir, output, by_year_output, lookup_path)
# Split LSOA: 2 burglaries split evenly → 1/yr each child, annualised to 12/yr each.
avg = pl.read_parquet(output).sort("LSOA code").to_dicts()
assert avg == [
{"LSOA code": "E01000050", "Burglary (avg/yr)": 12.0},
{"LSOA code": "E01000051", "Burglary (avg/yr)": 12.0},
{"LSOA code": "E01000099", "Burglary (avg/yr)": 12.0},
]
by_year = pl.read_parquet(by_year_output).sort("LSOA code").to_dicts()
burglaries = {row["LSOA code"]: row["Burglary (by year)"] for row in by_year}
assert burglaries["E01000050"] == [{"year": 2024, "count": 12.0}]
assert burglaries["E01000051"] == [{"year": 2024, "count": 12.0}]
assert burglaries["E01000099"] == [{"year": 2024, "count": 12.0}]

View file

@ -1,11 +1,17 @@
import polars as pl
import pytest
from shapely import box
from pipeline.transform.merge import (
_AREA_COLUMNS,
CONSERVATION_AREA_FEATURE,
LISTED_BUILDING_FEATURE,
TREE_DENSITY_FEATURE,
_is_dynamic_poi_metric_column,
_less_deprived_percentile_expr,
_matched_listed_building_flags,
_postcode_conservation_area_flags,
_postcode_listed_building_candidates,
_tree_density_by_postcode,
_validate_lad_source_coverage,
_validate_property_postcodes,
@ -48,6 +54,106 @@ def test_country_code_is_kept_in_postcode_area_columns() -> None:
assert "ctry25cd" in _AREA_COLUMNS
def test_conservation_area_feature_is_area_level() -> None:
assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
def test_listed_building_feature_is_property_level() -> None:
assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
def test_postcode_conservation_area_flags_marks_point_membership() -> None:
postcodes = pl.DataFrame(
{
"postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
"lat": [0.5, 2.0, None],
"lon": [0.5, 2.0, 0.5],
}
)
result = _postcode_conservation_area_flags(
postcodes, [box(0, 0, 1, 1)], "EPSG:4326", batch_size=2
).sort("postcode")
assert result.to_dicts() == [
{"postcode": "AA1 1AA", CONSERVATION_AREA_FEATURE: "Yes"},
{"postcode": "BB1 1BB", CONSERVATION_AREA_FEATURE: "No"},
{"postcode": "CC1 1CC", CONSERVATION_AREA_FEATURE: "No"},
]
def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
listed_points = pl.DataFrame(
{
"ListEntry": [1234, 5678],
"Name": ["1 and 2 High Street", "Distant Hall"],
"Grade": ["II", "I"],
"Easting": [100.0, 1000.0],
"Northing": [100.0, 1000.0],
}
).with_columns(
pl.col("Name")
.str.to_uppercase()
.str.replace_all(r"[^0-9A-Z]+", " ")
.str.replace_all(r"\s+", " ")
.str.strip_chars()
.alias("_listed_match_name")
)
active_postcodes = pl.DataFrame(
{
"postcode": ["AA1 1AA", "BB1 1BB"],
"east1m": [105.0, 5000.0],
"north1m": [105.0, 5000.0],
}
)
result = _postcode_listed_building_candidates(
listed_points,
active_postcodes,
nearest_postcodes=1,
max_distance_m=25,
)
assert result.select("postcode", "_listed_match_name").to_dicts() == [
{"postcode": "AA1 1AA", "_listed_match_name": "1 AND 2 HIGH STREET"}
]
def test_matched_listed_building_flags_requires_address_match() -> None:
properties = pl.DataFrame(
{
"postcode": ["AA1 1AA", "AA1 1AA", "BB1 1BB"],
"pp_address": ["1 HIGH STREET", "99 HIGH STREET", "THE OLD RECTORY"],
"epc_address": ["1, High Street", "99, High Street", "Old Rectory"],
}
)
listed_candidates = pl.DataFrame(
{
"postcode": ["AA1 1AA", "BB1 1BB"],
"_listed_match_name": ["1 AND 2 HIGH STREET", "OLD RECTORY"],
"_listed_grade": ["II", "II*"],
"_listed_entry": [1234, 5678],
}
)
result = _matched_listed_building_flags(
properties.lazy(), listed_candidates, min_score=95
).sort("postcode", "pp_address")
assert result.to_dicts() == [
{
"postcode": "AA1 1AA",
"pp_address": "1 HIGH STREET",
LISTED_BUILDING_FEATURE: "Yes",
},
{
"postcode": "BB1 1BB",
"pp_address": "THE OLD RECTORY",
LISTED_BUILDING_FEATURE: "Yes",
},
]
def test_validate_property_postcodes_rejects_blank_rows() -> None:
df = pl.DataFrame(
{

View file

@ -182,6 +182,19 @@ DROP_CATEGORIES = {
"public_transport/platform",
"public_transport/station",
"public_transport/stop_position",
# Education amenities — schools come from GIAS instead. OSM coverage for
# tertiary education, tutoring, and childcare is too noisy/incomplete to be
# useful on a property-search map.
"amenity/school",
"amenity/prep_school",
"amenity/language_school",
"amenity/music_school",
"amenity/university",
"amenity/college",
"building/university",
"amenity/kindergarten",
"amenity/childcare",
"office/tutoring",
}
@ -943,23 +956,10 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
"tourism/chalet",
],
),
(
"Education",
"School",
"🏫",
[
"amenity/school",
"amenity/prep_school",
"amenity/language_school",
"amenity/music_school",
"amenity/university",
"amenity/college",
"building/university",
"amenity/kindergarten",
"amenity/childcare",
"office/tutoring",
],
),
# Note: schools come from the GIAS register (see transform_gias_schools).
# Niche/tertiary education amenities that GIAS does not cover are dropped
# rather than mixed in with state-funded schools.
(
"Local Businesses",
"Hotel",
@ -1316,11 +1316,45 @@ def transform_grocery_retail_points(
).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")
def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
"""Convert the GIAS register parquet into POI rows with school metadata."""
return pl.scan_parquet(gias_path).select(
pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
pl.col("name"),
pl.lit("School").alias("category"),
pl.lit("School").alias("icon_category"),
pl.lit("Education").alias("group"),
pl.col("lat").cast(pl.Float64),
pl.col("lng").cast(pl.Float64),
pl.lit("🏫").alias("emoji"),
pl.col("phase").alias("school_phase"),
pl.col("type").alias("school_type"),
pl.col("type_group").alias("school_type_group"),
pl.col("age_range").alias("school_age_range"),
pl.col("gender").alias("school_gender"),
pl.col("religious_character").alias("school_religious_character"),
pl.col("admissions_policy").alias("school_admissions_policy"),
pl.col("nursery_provision").alias("school_nursery_provision"),
pl.col("sixth_form").alias("school_sixth_form"),
pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
pl.col("fsm_percent").cast(pl.Float32, strict=False).alias("school_fsm_percent"),
pl.col("trust").alias("school_trust"),
pl.col("address").alias("school_address"),
pl.col("postcode").alias("school_postcode"),
pl.col("local_authority").alias("school_local_authority"),
pl.col("website").alias("school_website"),
pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
pl.col("head_name").alias("school_head_name"),
)
def transform(
input_path: Path,
naptan_path: Path | None = None,
boundary_path: Path | None = None,
grocery_retail_points_path: Path | None = None,
naptan_path: Path,
boundary_path: Path,
grocery_retail_points_path: Path,
gias_path: Path,
) -> pl.LazyFrame:
lf = pl.scan_parquet(input_path)
@ -1372,24 +1406,21 @@ def transform(
)
naptan_df = pl.scan_parquet(naptan_path).collect()
if boundary_path is not None:
mask = in_england_mask(
boundary_path,
naptan_df["lat"].to_numpy(),
naptan_df["lng"].to_numpy(),
)
naptan_df = naptan_df.filter(pl.Series(mask))
mask = in_england_mask(
boundary_path,
naptan_df["lat"].to_numpy(),
naptan_df["lng"].to_numpy(),
)
naptan_df = naptan_df.filter(pl.Series(mask))
naptan = naptan_df.lazy().with_columns(
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
pl.lit("Public Transport").alias("group"),
pl.col("category").alias("icon_category"),
)
frames = [lf, naptan]
if grocery_retail_points_path is not None:
grocery_df = pl.read_parquet(grocery_retail_points_path)
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
frames.append(grocery_pois.lazy())
grocery_df = pl.read_parquet(grocery_retail_points_path)
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
frames = [lf, naptan, grocery_pois.lazy(), transform_gias_schools(gias_path)]
return pl.concat(frames, how="diagonal_relaxed")
@ -1413,8 +1444,15 @@ def main():
parser.add_argument(
"--grocery-retail-points",
type=Path,
required=True,
help="GEOLYTIX Grocery Retail Points parquet",
)
parser.add_argument(
"--gias",
type=Path,
required=True,
help="GIAS schools register parquet (replaces OSM schools)",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
@ -1425,6 +1463,7 @@ def main():
args.naptan,
args.boundary,
args.grocery_retail_points,
args.gias,
).collect(engine="streaming")
df.write_parquet(args.output)

View file

@ -0,0 +1,269 @@
"""Build PMTiles polygon tiles for the Trees Outside Woodland overlay."""
from __future__ import annotations
import argparse
import json
import shutil
import subprocess
import tempfile
from pathlib import Path
import numpy as np
import pyogrio
import shapely
from pyproj import Transformer
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.tree_density import (
DEFAULT_TOW_TYPES,
_layers,
_tow_dataset_path,
_where_for_tow_types,
)
def _require_tippecanoe() -> str:
executable = shutil.which("tippecanoe")
if executable is None:
raise RuntimeError(
"tippecanoe is required to build tree overlay PMTiles. "
"Install tippecanoe and rerun this target."
)
return executable
def _column_or_none(batch, names: list[str], column: str):
if column not in names:
return None
return batch.column(names.index(column)).to_numpy(zero_copy_only=False)
def _number_or_none(value) -> float | int | None:
if value is None:
return None
try:
if np.isfinite(value):
if float(value).is_integer():
return int(value)
return round(float(value), 2)
except TypeError:
return None
return None
def _write_tree_geojsonseq(
dataset_path: str,
output_path: Path,
tow_types: tuple[str, ...],
batch_size: int,
layer_names: tuple[str, ...] | None,
max_features_per_layer: int | None,
) -> int:
to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
where = _where_for_tow_types(tow_types)
layers = _layers(dataset_path, layer_names)
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
if where:
print(f"TOW type filter: {where}")
columns = [
"TOW_ID",
"Woodland_Type",
"TOW_Area_M",
"MEANHT",
"MINHT",
"MAXHT",
"LiDAR_Survey_Year",
]
feature_count = 0
with output_path.open("w") as file:
for layer in layers:
info = pyogrio.read_info(dataset_path, layer=layer)
print(f"\nLayer {layer}: {info.get('features', 0):,} features")
layer_features_seen = 0
with pyogrio.open_arrow(
dataset_path,
layer=layer,
columns=columns,
where=where,
batch_size=batch_size,
use_pyarrow=True,
) as (_meta, reader):
for batch in reader:
if max_features_per_layer is not None:
remaining = max_features_per_layer - layer_features_seen
if remaining <= 0:
break
if batch.num_rows > remaining:
batch = batch.slice(0, remaining)
layer_features_seen += batch.num_rows
names = batch.schema.names
area = np.asarray(
batch.column(names.index("TOW_Area_M")).to_numpy(
zero_copy_only=False
),
dtype=np.float64,
)
geometry = np.asarray(
batch.column(names.index("SHAPE")).to_numpy(
zero_copy_only=False
),
dtype=object,
)
valid = np.isfinite(area) & (area > 0)
if not valid.any():
continue
tow_id = _column_or_none(batch, names, "TOW_ID")
woodland_type = _column_or_none(batch, names, "Woodland_Type")
mean_height = _column_or_none(batch, names, "MEANHT")
min_height = _column_or_none(batch, names, "MINHT")
max_height = _column_or_none(batch, names, "MAXHT")
lidar_year = _column_or_none(batch, names, "LiDAR_Survey_Year")
geometries = shapely.from_wkb(geometry[valid])
geometries = shapely.transform(
geometries,
to_wgs84.transform,
interleaved=False,
)
geometries_json = shapely.to_geojson(geometries)
valid_indexes = np.flatnonzero(valid)
for idx, geometry_json in zip(valid_indexes, geometries_json):
properties = {
"tow_id": str(tow_id[idx]) if tow_id is not None else "",
"woodland_type": (
str(woodland_type[idx])
if woodland_type is not None
else ""
),
"area_sqm": _number_or_none(area[idx]),
"mean_height_m": (
_number_or_none(mean_height[idx])
if mean_height is not None
else None
),
"min_height_m": (
_number_or_none(min_height[idx])
if min_height is not None
else None
),
"max_height_m": (
_number_or_none(max_height[idx])
if max_height is not None
else None
),
"lidar_year": (
_number_or_none(lidar_year[idx])
if lidar_year is not None
else None
),
"source_layer": layer,
}
feature = {
"type": "Feature",
"geometry": json.loads(geometry_json),
"properties": properties,
}
file.write(json.dumps(feature, separators=(",", ":")) + "\n")
feature_count += 1
return feature_count
def build_tree_overlay_tiles(
tow_zip: Path,
output_path: Path,
extract_dir: Path,
tow_types: tuple[str, ...],
batch_size: int,
layer_names: tuple[str, ...] | None,
max_features_per_layer: int | None,
min_zoom: int,
max_zoom: int,
force_extract: bool,
use_vsizip: bool,
) -> None:
tippecanoe = _require_tippecanoe()
dataset_path = _tow_dataset_path(tow_zip, extract_dir, force_extract, use_vsizip)
output_path.parent.mkdir(parents=True, exist_ok=True)
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
ndjson_path = Path(tmp) / "trees_outside_woodlands.geojsonseq"
feature_count = _write_tree_geojsonseq(
dataset_path,
ndjson_path,
tow_types,
batch_size,
layer_names,
max_features_per_layer,
)
print(f"Writing {feature_count:,} TOW polygon features")
subprocess.run(
[
tippecanoe,
"--force",
"--output",
str(output_path),
"--layer",
"trees_outside_woodlands",
"--minimum-zoom",
str(min_zoom),
"--maximum-zoom",
str(max_zoom),
"--drop-smallest-as-needed",
"--extend-zooms-if-still-dropping",
str(ndjson_path),
],
check=True,
)
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--tow-zip", type=Path, required=True)
parser.add_argument("--output", type=Path, required=True)
parser.add_argument(
"--extract-dir",
type=Path,
default=Path("property-data/fr_tow_v1_all"),
help="Directory used to extract the FileGDB",
)
parser.add_argument(
"--tow-type",
action="append",
dest="tow_types",
help="Woodland_Type to include; repeatable. Defaults to TOW outside-woodland classes.",
)
parser.add_argument("--batch-size", type=int, default=50_000)
parser.add_argument("--layer", action="append", dest="layers")
parser.add_argument("--max-features-per-layer", type=int)
parser.add_argument("--min-zoom", type=int, default=15)
parser.add_argument("--max-zoom", type=int, default=17)
parser.add_argument("--force-extract", action="store_true")
parser.add_argument("--use-vsizip", action="store_true")
args = parser.parse_args()
build_tree_overlay_tiles(
tow_zip=args.tow_zip,
output_path=args.output,
extract_dir=args.extract_dir,
tow_types=tuple(args.tow_types or DEFAULT_TOW_TYPES),
batch_size=args.batch_size,
layer_names=tuple(args.layers) if args.layers else None,
max_features_per_layer=args.max_features_per_layer,
min_zoom=args.min_zoom,
max_zoom=args.max_zoom,
force_extract=args.force_extract,
use_vsizip=args.use_vsizip,
)
if __name__ == "__main__":
main()