has issues
This commit is contained in:
parent
2e112d7398
commit
c645b0f1d4
96 changed files with 2147083 additions and 5787 deletions
|
|
@ -7,6 +7,27 @@ import polars as pl
|
|||
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
|
||||
MONTH_RE = r"^\d{4}-\d{2}$"
|
||||
|
||||
# Crime types that roll up into "Serious crime" / "Minor crime" aggregates.
|
||||
# Must match the names used in pipeline/transform/merge.py for the sum_horizontal expressions.
|
||||
SERIOUS_CRIME_TYPES = (
|
||||
"Violence and sexual offences",
|
||||
"Robbery",
|
||||
"Burglary",
|
||||
"Possession of weapons",
|
||||
)
|
||||
MINOR_CRIME_TYPES = (
|
||||
"Anti-social behaviour",
|
||||
"Criminal damage and arson",
|
||||
"Shoplifting",
|
||||
"Bicycle theft",
|
||||
"Theft from the person",
|
||||
"Other theft",
|
||||
"Vehicle crime",
|
||||
"Public order",
|
||||
"Drugs",
|
||||
"Other crime",
|
||||
)
|
||||
|
||||
|
||||
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
|
||||
csvs = sorted(crime_dir.rglob("*.csv"))
|
||||
|
|
@ -14,7 +35,12 @@ def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
|
|||
return street_csvs, len(csvs) - len(street_csvs)
|
||||
|
||||
|
||||
def transform_crime(crime_dir: Path, output_path: Path) -> None:
|
||||
def transform_crime(
|
||||
crime_dir: Path,
|
||||
output_path: Path,
|
||||
by_year_output_path: Path | None = None,
|
||||
lsoa_lookup_path: Path | None = None,
|
||||
) -> None:
|
||||
csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
|
||||
if not csvs:
|
||||
raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
|
||||
|
|
@ -38,6 +64,8 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
|
|||
},
|
||||
).select("LSOA code", "Crime type", "Month")
|
||||
|
||||
df = _apply_lsoa_2011_to_2021(df, lsoa_lookup_path)
|
||||
|
||||
valid_month_expr = pl.col("Month").str.contains(MONTH_RE)
|
||||
valid_months = (
|
||||
df.filter(valid_month_expr)
|
||||
|
|
@ -57,6 +85,9 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
|
|||
)
|
||||
|
||||
# Count monthly incidents, then annualise over every valid month in the dataset.
|
||||
# `_weight` (≤1) comes from the LSOA 2011→2021 lookup: 2011 LSOAs that split
|
||||
# into N 2021 LSOAs contribute 1/N of their count to each child, since we
|
||||
# don't know which child a given incident actually belonged to.
|
||||
yearly_counts = (
|
||||
df.filter(
|
||||
valid_month_expr
|
||||
|
|
@ -66,7 +97,7 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
|
|||
& (pl.col("Crime type") != "")
|
||||
)
|
||||
.group_by("LSOA code", "Month", "Crime type")
|
||||
.agg(pl.len().alias("count"))
|
||||
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
|
||||
.group_by("LSOA code", "Crime type")
|
||||
.agg(
|
||||
(pl.col("count").sum() / pl.lit(valid_month_count) * 12)
|
||||
|
|
@ -98,6 +129,118 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
|
|||
wide.write_parquet(output_path, compression="zstd")
|
||||
print(f"Saved to {output_path}")
|
||||
|
||||
if by_year_output_path is not None:
|
||||
_write_crime_by_year(df, valid_month_expr, by_year_output_path)
|
||||
|
||||
|
||||
def _write_crime_by_year(
|
||||
df: pl.LazyFrame, valid_month_expr: pl.Expr, by_year_output_path: Path
|
||||
) -> None:
|
||||
"""Emit per-LSOA per-type per-year crime counts as nested list[struct] columns.
|
||||
|
||||
Partial years are scaled to a 12-month-equivalent count so cross-year trends
|
||||
aren't distorted by months missing from the source data.
|
||||
"""
|
||||
filtered = df.filter(
|
||||
valid_month_expr
|
||||
& pl.col("LSOA code").is_not_null()
|
||||
& (pl.col("LSOA code") != "")
|
||||
& pl.col("Crime type").is_not_null()
|
||||
& (pl.col("Crime type") != "")
|
||||
).with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
|
||||
|
||||
# Months observed *anywhere* in the dataset for each year (annualisation denominator).
|
||||
# Using crime-type-specific months would over-scale years where a rare type appears
|
||||
# in only some months.
|
||||
months_per_year = filtered.group_by("year").agg(
|
||||
pl.col("Month").n_unique().alias("months_in_year")
|
||||
)
|
||||
|
||||
yearly_per_type = (
|
||||
filtered.group_by("LSOA code", "Crime type", "year", "Month")
|
||||
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
|
||||
.group_by("LSOA code", "Crime type", "year")
|
||||
.agg(pl.col("count").sum().alias("count"))
|
||||
.join(months_per_year, on="year")
|
||||
.with_columns(
|
||||
(pl.col("count").cast(pl.Float32) * 12.0 / pl.col("months_in_year"))
|
||||
.round(1)
|
||||
.alias("count")
|
||||
)
|
||||
.select("LSOA code", "Crime type", "year", "count")
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
|
||||
if yearly_per_type.is_empty():
|
||||
raise ValueError("No valid crime rows for by-year output")
|
||||
|
||||
serious_rollup = _rollup_long(yearly_per_type, SERIOUS_CRIME_TYPES, "Serious crime")
|
||||
minor_rollup = _rollup_long(yearly_per_type, MINOR_CRIME_TYPES, "Minor crime")
|
||||
combined = pl.concat([yearly_per_type, serious_rollup, minor_rollup])
|
||||
|
||||
by_lsoa_type = (
|
||||
combined.sort("year")
|
||||
.group_by("LSOA code", "Crime type")
|
||||
.agg(pl.struct("year", "count").alias("series"))
|
||||
)
|
||||
|
||||
wide_by_year = by_lsoa_type.pivot(
|
||||
on="Crime type", index="LSOA code", values="series"
|
||||
)
|
||||
|
||||
type_cols = [c for c in wide_by_year.columns if c != "LSOA code"]
|
||||
wide_by_year = wide_by_year.rename({col: f"{col} (by year)" for col in type_cols})
|
||||
|
||||
print(f"By-year output shape: {wide_by_year.shape}")
|
||||
print(f"By-year columns: {wide_by_year.columns}")
|
||||
|
||||
wide_by_year.write_parquet(by_year_output_path, compression="zstd")
|
||||
print(f"Saved by-year output to {by_year_output_path}")
|
||||
|
||||
|
||||
def _rollup_long(
|
||||
yearly_per_type: pl.DataFrame, types: tuple[str, ...], rollup_name: str
|
||||
) -> pl.DataFrame:
|
||||
"""Sum per-year counts across a set of crime types into a single rollup type."""
|
||||
return (
|
||||
yearly_per_type.filter(pl.col("Crime type").is_in(list(types)))
|
||||
.group_by("LSOA code", "year")
|
||||
.agg(pl.col("count").sum().round(1).alias("count"))
|
||||
.with_columns(pl.lit(rollup_name).alias("Crime type"))
|
||||
.select("LSOA code", "Crime type", "year", "count")
|
||||
)
|
||||
|
||||
|
||||
def _apply_lsoa_2011_to_2021(
|
||||
df: pl.LazyFrame, lsoa_lookup_path: Path | None
|
||||
) -> pl.LazyFrame:
|
||||
"""Remap pre-2022 LSOA 2011 codes to LSOA 2021 codes.
|
||||
|
||||
Police.uk reports older years using LSOA 2011 codes; the rest of the pipeline
|
||||
keys on LSOA 2021. Without remapping, those years silently fail to join and
|
||||
the crime-over-time chart only shows post-2022 data.
|
||||
|
||||
For 1:1 mappings the LSOA code is rewritten in place. For 1→N splits (one
|
||||
2011 LSOA becoming several 2021 ones), each child gets an even share via
|
||||
`_weight = 1/N` since the source CSVs don't tell us which child a given
|
||||
incident actually fell into.
|
||||
"""
|
||||
if lsoa_lookup_path is None:
|
||||
return df.with_columns(pl.lit(1.0).alias("_weight"))
|
||||
|
||||
lookup = pl.scan_parquet(lsoa_lookup_path).select("lsoa11", "lsoa21")
|
||||
weighted = lookup.with_columns(
|
||||
(1.0 / pl.col("lsoa21").count().over("lsoa11")).alias("_weight")
|
||||
)
|
||||
return (
|
||||
df.join(weighted, left_on="LSOA code", right_on="lsoa11", how="left")
|
||||
.with_columns(
|
||||
pl.coalesce("lsoa21", "LSOA code").alias("LSOA code"),
|
||||
pl.col("_weight").fill_null(1.0),
|
||||
)
|
||||
.drop("lsoa21")
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
|
|
@ -109,8 +252,22 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output parquet file path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-by-year",
|
||||
type=Path,
|
||||
required=False,
|
||||
help="Optional output parquet for per-LSOA per-year per-type counts (nested list[struct])",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lsoa-lookup",
|
||||
type=Path,
|
||||
required=False,
|
||||
help="Optional parquet with columns (lsoa11, lsoa21) for remapping pre-2022 codes",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
transform_crime(args.input, args.output)
|
||||
transform_crime(
|
||||
args.input, args.output, args.output_by_year, args.lsoa_lookup
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
159
pipeline/transform/crime_hotspot_tiles.py
Normal file
159
pipeline/transform/crime_hotspot_tiles.py
Normal file
|
|
@ -0,0 +1,159 @@
|
|||
"""Build PMTiles point tiles for the crime heatmap overlay.
|
||||
|
||||
The output intentionally keeps point features rather than H3/grid aggregates so
|
||||
MapLibre can render a true client-side heatmap. Police.uk coordinates are
|
||||
published anonymous map points, not exact offence locations.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.transform.crime import find_street_crime_csvs
|
||||
|
||||
|
||||
def _latest_months(crime_dir: Path, month_count: int) -> list[str]:
|
||||
csvs, _ignored = find_street_crime_csvs(crime_dir)
|
||||
months = sorted({path.parent.name for path in csvs})
|
||||
if not months:
|
||||
raise FileNotFoundError(f"No street crime CSVs found in {crime_dir}")
|
||||
return months[-month_count:]
|
||||
|
||||
|
||||
def _street_csvs_for_months(crime_dir: Path, months: set[str]) -> list[Path]:
|
||||
csvs, _ignored = find_street_crime_csvs(crime_dir)
|
||||
selected = [path for path in csvs if path.parent.name in months]
|
||||
if not selected:
|
||||
raise FileNotFoundError(f"No street crime CSVs found for {sorted(months)}")
|
||||
return selected
|
||||
|
||||
|
||||
def _require_tippecanoe() -> str:
|
||||
executable = shutil.which("tippecanoe")
|
||||
if executable is None:
|
||||
raise RuntimeError(
|
||||
"tippecanoe is required to build crime hotspot PMTiles. "
|
||||
"Install tippecanoe and rerun this target."
|
||||
)
|
||||
return executable
|
||||
|
||||
|
||||
def _write_geojsonseq(csvs: list[Path], output_path: Path) -> int:
|
||||
df = (
|
||||
pl.scan_csv(
|
||||
csvs,
|
||||
schema_overrides={
|
||||
"Longitude": pl.Float64,
|
||||
"Latitude": pl.Float64,
|
||||
"Month": pl.Utf8,
|
||||
"Crime type": pl.Utf8,
|
||||
},
|
||||
ignore_errors=True,
|
||||
)
|
||||
.select(
|
||||
pl.col("Longitude").alias("lon"),
|
||||
pl.col("Latitude").alias("lat"),
|
||||
pl.col("Month").alias("month"),
|
||||
pl.col("Crime type").alias("crime_type"),
|
||||
)
|
||||
.drop_nulls(["lon", "lat"])
|
||||
.filter(pl.col("lon").is_between(-9.5, 5.0))
|
||||
.filter(pl.col("lat").is_between(49.0, 57.0))
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
|
||||
with output_path.open("w") as file:
|
||||
for row in df.iter_rows(named=True):
|
||||
feature = {
|
||||
"type": "Feature",
|
||||
"geometry": {
|
||||
"type": "Point",
|
||||
"coordinates": [row["lon"], row["lat"]],
|
||||
},
|
||||
"properties": {
|
||||
"count": 1,
|
||||
"weight": 1,
|
||||
"month": row["month"],
|
||||
"crime_type": row["crime_type"],
|
||||
},
|
||||
}
|
||||
file.write(json.dumps(feature, separators=(",", ":")) + "\n")
|
||||
|
||||
return df.height
|
||||
|
||||
|
||||
def build_crime_hotspot_tiles(
|
||||
crime_dir: Path,
|
||||
output_path: Path,
|
||||
months: int,
|
||||
min_zoom: int,
|
||||
max_zoom: int,
|
||||
) -> None:
|
||||
tippecanoe = _require_tippecanoe()
|
||||
selected_months = set(_latest_months(crime_dir, months))
|
||||
csvs = _street_csvs_for_months(crime_dir, selected_months)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
|
||||
ndjson_path = Path(tmp) / "crime_hotspots.geojsonseq"
|
||||
feature_count = _write_geojsonseq(csvs, ndjson_path)
|
||||
print(
|
||||
f"Writing {feature_count:,} approximate crime heatmap points "
|
||||
f"from {min(selected_months)} to {max(selected_months)}"
|
||||
)
|
||||
|
||||
subprocess.run(
|
||||
[
|
||||
tippecanoe,
|
||||
"--force",
|
||||
"--output",
|
||||
str(output_path),
|
||||
"--layer",
|
||||
"crime_hotspots",
|
||||
"--minimum-zoom",
|
||||
str(min_zoom),
|
||||
"--maximum-zoom",
|
||||
str(max_zoom),
|
||||
"--drop-densest-as-needed",
|
||||
"--extend-zooms-if-still-dropping",
|
||||
str(ndjson_path),
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--input", type=Path, required=True, help="Crime CSV directory")
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output .pmtiles path"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--months",
|
||||
type=int,
|
||||
default=12,
|
||||
help="Latest complete months to include in the heatmap",
|
||||
)
|
||||
parser.add_argument("--min-zoom", type=int, default=12)
|
||||
parser.add_argument("--max-zoom", type=int, default=16)
|
||||
args = parser.parse_args()
|
||||
|
||||
build_crime_hotspot_tiles(
|
||||
args.input,
|
||||
args.output,
|
||||
args.months,
|
||||
args.min_zoom,
|
||||
args.max_zoom,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,12 +1,27 @@
|
|||
import argparse
|
||||
import re
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from pathlib import Path
|
||||
|
||||
import pyogrio
|
||||
from pyproj import Transformer
|
||||
from scipy.spatial import cKDTree
|
||||
from shapely import from_wkb, points
|
||||
from shapely.geometry.base import BaseGeometry
|
||||
from shapely.strtree import STRtree
|
||||
from thefuzz import fuzz
|
||||
|
||||
from pipeline.utils.fuzzy_join import normalize_address_key
|
||||
from pipeline.utils.postcode_mapping import build_postcode_mapping
|
||||
|
||||
MIN_FLOOR_AREA_M2 = 10
|
||||
CONSERVATION_AREA_FEATURE = "Within conservation area"
|
||||
LISTED_BUILDING_FEATURE = "Listed building"
|
||||
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
|
||||
LISTED_BUILDING_NEAREST_POSTCODES = 3
|
||||
LISTED_BUILDING_MIN_MATCH_SCORE = 95
|
||||
|
||||
_IOD_PERCENTILE_COLUMNS = [
|
||||
"Education, Skills and Training Score",
|
||||
|
|
@ -24,6 +39,8 @@ _AREA_COLUMNS = [
|
|||
"lon",
|
||||
# Runtime provenance for deciding whether missing coordinates are skippable.
|
||||
"ctry25cd",
|
||||
# Keyed lookup for postcode-level side tables (e.g. crime time series).
|
||||
"lsoa21",
|
||||
# Deprivation
|
||||
"Income Score",
|
||||
"Employment Score",
|
||||
|
|
@ -63,6 +80,7 @@ _AREA_COLUMNS = [
|
|||
# Environment
|
||||
"Noise (dB)",
|
||||
"Max available download speed (Mbps)",
|
||||
CONSERVATION_AREA_FEATURE,
|
||||
# Schools
|
||||
"Good+ primary schools within 5km",
|
||||
"Good+ secondary schools within 5km",
|
||||
|
|
@ -97,6 +115,20 @@ _RENT_SOURCE_UNAVAILABLE_LADS = {
|
|||
"E06000053": "Isles of Scilly",
|
||||
"E09000001": "City of London",
|
||||
}
|
||||
_NUMBER_RE = re.compile(r"\d+")
|
||||
_LISTED_NAME_STOP_WORDS = {
|
||||
"A",
|
||||
"AN",
|
||||
"AND",
|
||||
"AT",
|
||||
"BY",
|
||||
"IN",
|
||||
"OF",
|
||||
"ON",
|
||||
"THE",
|
||||
"TO",
|
||||
"WITH",
|
||||
}
|
||||
|
||||
|
||||
def _is_dynamic_poi_metric_column(column: str) -> bool:
|
||||
|
|
@ -105,6 +137,389 @@ def _is_dynamic_poi_metric_column(column: str) -> bool:
|
|||
)
|
||||
|
||||
|
||||
def _numbers_compatible(left: str, right: str) -> bool:
|
||||
"""Require address/list-entry numbers to agree when either side has numbers."""
|
||||
left_nums = set(_NUMBER_RE.findall(left))
|
||||
right_nums = set(_NUMBER_RE.findall(right))
|
||||
smaller, larger = (
|
||||
(left_nums, right_nums)
|
||||
if len(left_nums) <= len(right_nums)
|
||||
else (right_nums, left_nums)
|
||||
)
|
||||
if not smaller and larger:
|
||||
return False
|
||||
return smaller.issubset(larger)
|
||||
|
||||
|
||||
def _listed_candidate_schema() -> dict[str, pl.DataType]:
|
||||
return {
|
||||
"postcode": pl.Utf8,
|
||||
"_listed_match_name": pl.Utf8,
|
||||
"_listed_grade": pl.Utf8,
|
||||
"_listed_entry": pl.Int64,
|
||||
}
|
||||
|
||||
|
||||
def _empty_listed_candidates() -> pl.DataFrame:
|
||||
return pl.DataFrame(schema=_listed_candidate_schema())
|
||||
|
||||
|
||||
def _empty_listed_property_flags() -> pl.DataFrame:
|
||||
return pl.DataFrame(
|
||||
schema={
|
||||
"postcode": pl.Utf8,
|
||||
"pp_address": pl.Utf8,
|
||||
LISTED_BUILDING_FEATURE: pl.Utf8,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def _is_matchable_listed_name(name_key: str | None) -> bool:
|
||||
if not name_key:
|
||||
return False
|
||||
if _NUMBER_RE.search(name_key):
|
||||
return True
|
||||
substantive_tokens = [
|
||||
token
|
||||
for token in name_key.split()
|
||||
if token not in _LISTED_NAME_STOP_WORDS and len(token) >= 3
|
||||
]
|
||||
return len(substantive_tokens) >= 2
|
||||
|
||||
|
||||
def _load_listed_building_points(listed_buildings_path: Path) -> pl.DataFrame:
|
||||
"""Load Historic England NHLE listed-building point attributes."""
|
||||
columns = ["ListEntry", "Name", "Grade", "Easting", "Northing"]
|
||||
info = pyogrio.read_info(listed_buildings_path)
|
||||
geometry_type = str(info.get("geometry_type") or "")
|
||||
if "Point" not in geometry_type:
|
||||
raise ValueError(
|
||||
f"Expected listed-building point data, got geometry {geometry_type!r}"
|
||||
)
|
||||
_, table = pyogrio.read_arrow(
|
||||
listed_buildings_path,
|
||||
columns=columns,
|
||||
read_geometry=False,
|
||||
)
|
||||
df = pl.from_arrow(table)
|
||||
missing = sorted(set(columns) - set(df.columns))
|
||||
if missing:
|
||||
raise ValueError(
|
||||
f"{listed_buildings_path} is missing listed-building columns: {missing}"
|
||||
)
|
||||
return (
|
||||
df.select(
|
||||
pl.col("ListEntry").cast(pl.Int64),
|
||||
pl.col("Name").cast(pl.Utf8),
|
||||
pl.col("Grade").cast(pl.Utf8),
|
||||
pl.col("Easting").cast(pl.Float64),
|
||||
pl.col("Northing").cast(pl.Float64),
|
||||
)
|
||||
.drop_nulls(["Name", "Easting", "Northing"])
|
||||
.with_columns(normalize_address_key(pl.col("Name")).alias("_listed_match_name"))
|
||||
.filter(pl.col("_listed_match_name").is_not_null())
|
||||
)
|
||||
|
||||
|
||||
def _postcode_listed_building_candidates(
|
||||
listed_points: pl.DataFrame,
|
||||
active_postcodes: pl.DataFrame,
|
||||
*,
|
||||
nearest_postcodes: int = LISTED_BUILDING_NEAREST_POSTCODES,
|
||||
max_distance_m: float = LISTED_BUILDING_MATCH_RADIUS_M,
|
||||
) -> pl.DataFrame:
|
||||
"""Assign each listed-building point to nearby active postcode candidates."""
|
||||
if listed_points.is_empty() or active_postcodes.is_empty():
|
||||
return _empty_listed_candidates()
|
||||
|
||||
required_postcode_cols = {"postcode", "east1m", "north1m"}
|
||||
missing = sorted(required_postcode_cols - set(active_postcodes.columns))
|
||||
if missing:
|
||||
raise ValueError(f"Active postcode data missing required columns: {missing}")
|
||||
|
||||
required_listed_cols = {
|
||||
"_listed_match_name",
|
||||
"Grade",
|
||||
"ListEntry",
|
||||
"Easting",
|
||||
"Northing",
|
||||
}
|
||||
missing = sorted(required_listed_cols - set(listed_points.columns))
|
||||
if missing:
|
||||
raise ValueError(f"Listed-building data missing required columns: {missing}")
|
||||
|
||||
postcodes = active_postcodes.drop_nulls(["postcode", "east1m", "north1m"])
|
||||
postcodes = postcodes.filter(
|
||||
pl.col("east1m").is_finite() & pl.col("north1m").is_finite()
|
||||
)
|
||||
listed = listed_points.drop_nulls(["_listed_match_name", "Easting", "Northing"])
|
||||
listed = listed.filter(
|
||||
pl.col("Easting").is_finite() & pl.col("Northing").is_finite()
|
||||
)
|
||||
if postcodes.is_empty() or listed.is_empty():
|
||||
return _empty_listed_candidates()
|
||||
|
||||
postcode_coords = np.column_stack(
|
||||
[postcodes["east1m"].to_numpy(), postcodes["north1m"].to_numpy()]
|
||||
)
|
||||
listed_coords = np.column_stack(
|
||||
[listed["Easting"].to_numpy(), listed["Northing"].to_numpy()]
|
||||
)
|
||||
k = max(1, min(nearest_postcodes, postcodes.height))
|
||||
distances, indices = cKDTree(postcode_coords).query(
|
||||
listed_coords,
|
||||
k=k,
|
||||
distance_upper_bound=max_distance_m,
|
||||
)
|
||||
if k == 1:
|
||||
distances = distances[:, np.newaxis]
|
||||
indices = indices[:, np.newaxis]
|
||||
|
||||
postcode_values = postcodes["postcode"].to_list()
|
||||
listed_names = listed["_listed_match_name"].to_list()
|
||||
listed_grades = listed["Grade"].to_list()
|
||||
listed_entries = listed["ListEntry"].to_list()
|
||||
|
||||
rows: list[tuple[str, str, str | None, int | None]] = []
|
||||
for listed_idx in range(listed.height):
|
||||
name_key = listed_names[listed_idx]
|
||||
if not _is_matchable_listed_name(name_key):
|
||||
continue
|
||||
seen_postcodes: set[str] = set()
|
||||
for distance, postcode_idx in zip(distances[listed_idx], indices[listed_idx]):
|
||||
if not np.isfinite(distance) or postcode_idx >= postcodes.height:
|
||||
continue
|
||||
postcode = postcode_values[int(postcode_idx)]
|
||||
if postcode in seen_postcodes:
|
||||
continue
|
||||
seen_postcodes.add(postcode)
|
||||
rows.append(
|
||||
(
|
||||
postcode,
|
||||
name_key,
|
||||
listed_grades[listed_idx],
|
||||
listed_entries[listed_idx],
|
||||
)
|
||||
)
|
||||
|
||||
if not rows:
|
||||
return _empty_listed_candidates()
|
||||
|
||||
return (
|
||||
pl.DataFrame(
|
||||
rows,
|
||||
schema=[
|
||||
"postcode",
|
||||
"_listed_match_name",
|
||||
"_listed_grade",
|
||||
"_listed_entry",
|
||||
],
|
||||
orient="row",
|
||||
)
|
||||
.cast(_listed_candidate_schema())
|
||||
.unique(["postcode", "_listed_match_name", "_listed_entry"])
|
||||
)
|
||||
|
||||
|
||||
def _matched_listed_building_flags(
|
||||
properties: pl.LazyFrame,
|
||||
listed_candidates: pl.DataFrame,
|
||||
*,
|
||||
min_score: int = LISTED_BUILDING_MIN_MATCH_SCORE,
|
||||
) -> pl.DataFrame:
|
||||
"""Return property keys that conservatively match an NHLE listed entry."""
|
||||
if listed_candidates.is_empty():
|
||||
return _empty_listed_property_flags()
|
||||
|
||||
candidate_postcodes = listed_candidates.select("postcode").unique()
|
||||
property_candidates = (
|
||||
properties.select("postcode", "pp_address", "epc_address")
|
||||
.join(candidate_postcodes.lazy(), on="postcode", how="semi")
|
||||
.with_columns(
|
||||
normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
|
||||
normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
|
||||
)
|
||||
.filter(
|
||||
pl.col("pp_address").is_not_null()
|
||||
& (
|
||||
pl.col("_pp_match_address").is_not_null()
|
||||
| pl.col("_epc_match_address").is_not_null()
|
||||
)
|
||||
)
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
if property_candidates.is_empty():
|
||||
return _empty_listed_property_flags()
|
||||
|
||||
listed_by_postcode: dict[str, list[str]] = {}
|
||||
for postcode, name in listed_candidates.select(
|
||||
"postcode", "_listed_match_name"
|
||||
).iter_rows():
|
||||
if postcode and name:
|
||||
listed_by_postcode.setdefault(postcode, []).append(name)
|
||||
|
||||
matches: list[tuple[str, str, str]] = []
|
||||
for row in property_candidates.iter_rows(named=True):
|
||||
postcode = row["postcode"]
|
||||
listed_names = listed_by_postcode.get(postcode)
|
||||
if not listed_names:
|
||||
continue
|
||||
|
||||
address_keys = []
|
||||
for col in ("_pp_match_address", "_epc_match_address"):
|
||||
value = row.get(col)
|
||||
if value and value not in address_keys:
|
||||
address_keys.append(value)
|
||||
|
||||
matched = False
|
||||
for address_key in address_keys:
|
||||
for listed_name in listed_names:
|
||||
if not _numbers_compatible(address_key, listed_name):
|
||||
continue
|
||||
if fuzz.token_set_ratio(address_key, listed_name) >= min_score:
|
||||
matched = True
|
||||
break
|
||||
if matched:
|
||||
break
|
||||
|
||||
if matched:
|
||||
matches.append((postcode, row["pp_address"], "Yes"))
|
||||
|
||||
if not matches:
|
||||
return _empty_listed_property_flags()
|
||||
|
||||
return (
|
||||
pl.DataFrame(
|
||||
matches,
|
||||
schema=["postcode", "pp_address", LISTED_BUILDING_FEATURE],
|
||||
orient="row",
|
||||
)
|
||||
.cast(
|
||||
{
|
||||
"postcode": pl.Utf8,
|
||||
"pp_address": pl.Utf8,
|
||||
LISTED_BUILDING_FEATURE: pl.Utf8,
|
||||
}
|
||||
)
|
||||
.unique(["postcode", "pp_address"])
|
||||
)
|
||||
|
||||
|
||||
def _listed_building_flags(
|
||||
properties: pl.LazyFrame,
|
||||
active_postcodes: pl.DataFrame,
|
||||
listed_buildings_path: Path,
|
||||
) -> pl.DataFrame:
|
||||
print(f"Loading listed-building points from {listed_buildings_path}...")
|
||||
listed_points = _load_listed_building_points(listed_buildings_path)
|
||||
print(f"Loaded {listed_points.height} listed-building point records")
|
||||
listed_candidates = _postcode_listed_building_candidates(
|
||||
listed_points, active_postcodes
|
||||
)
|
||||
print(
|
||||
"Matching listed-building names to property addresses across "
|
||||
f"{listed_candidates['postcode'].n_unique()} nearby postcodes..."
|
||||
)
|
||||
flags = _matched_listed_building_flags(properties, listed_candidates)
|
||||
print(f"Matched {flags.height} property addresses to listed-building entries")
|
||||
return flags
|
||||
|
||||
|
||||
def _normalise_crs(crs: object | None) -> str:
|
||||
return str(crs) if crs else "EPSG:4326"
|
||||
|
||||
|
||||
def _load_conservation_area_geometries(
|
||||
conservation_areas_path: Path,
|
||||
) -> tuple[list[BaseGeometry], str]:
|
||||
metadata, table = pyogrio.read_arrow(conservation_areas_path, columns=[])
|
||||
geometry_name = metadata.get("geometry_name") or table.column_names[-1]
|
||||
geometries = []
|
||||
for geom in from_wkb(table[geometry_name].combine_chunks().to_pylist()):
|
||||
if geom is not None and not geom.is_empty:
|
||||
geometries.append(geom)
|
||||
if not geometries:
|
||||
raise ValueError(
|
||||
f"{conservation_areas_path} does not contain any usable polygon geometries"
|
||||
)
|
||||
return geometries, _normalise_crs(metadata.get("crs"))
|
||||
|
||||
|
||||
def _postcode_conservation_area_flags(
|
||||
postcodes: pl.DataFrame,
|
||||
conservation_geometries: list[BaseGeometry],
|
||||
conservation_crs: object | None,
|
||||
batch_size: int = 100_000,
|
||||
) -> pl.DataFrame:
|
||||
required = {"postcode", "lat", "lon"}
|
||||
missing = sorted(required - set(postcodes.columns))
|
||||
if missing:
|
||||
raise ValueError(f"Postcode data missing required columns: {missing}")
|
||||
|
||||
all_postcodes = postcodes.select("postcode").drop_nulls().unique()
|
||||
valid_points = postcodes.select("postcode", "lat", "lon").drop_nulls()
|
||||
if valid_points.is_empty():
|
||||
return all_postcodes.with_columns(pl.lit("No").alias(CONSERVATION_AREA_FEATURE))
|
||||
|
||||
lat = valid_points["lat"].to_numpy()
|
||||
lon = valid_points["lon"].to_numpy()
|
||||
finite = np.isfinite(lat) & np.isfinite(lon)
|
||||
valid_points = valid_points.filter(pl.Series(finite))
|
||||
if valid_points.is_empty():
|
||||
return all_postcodes.with_columns(pl.lit("No").alias(CONSERVATION_AREA_FEATURE))
|
||||
|
||||
lat = valid_points["lat"].to_numpy()
|
||||
lon = valid_points["lon"].to_numpy()
|
||||
transformer = Transformer.from_crs(
|
||||
"EPSG:4326", _normalise_crs(conservation_crs), always_xy=True
|
||||
)
|
||||
x, y = transformer.transform(lon, lat)
|
||||
|
||||
tree = STRtree(conservation_geometries)
|
||||
inside = np.zeros(valid_points.height, dtype=bool)
|
||||
for start in range(0, valid_points.height, batch_size):
|
||||
end = min(start + batch_size, valid_points.height)
|
||||
point_batch = points(x[start:end], y[start:end])
|
||||
matches = tree.query(point_batch, predicate="intersects")
|
||||
if matches.size > 0:
|
||||
inside[start + matches[0]] = True
|
||||
|
||||
matched = (
|
||||
valid_points.select("postcode")
|
||||
.with_columns(pl.Series("_within_conservation_area", inside))
|
||||
.group_by("postcode")
|
||||
.agg(pl.col("_within_conservation_area").max())
|
||||
.with_columns(
|
||||
pl.when(pl.col("_within_conservation_area"))
|
||||
.then(pl.lit("Yes"))
|
||||
.otherwise(pl.lit("No"))
|
||||
.alias(CONSERVATION_AREA_FEATURE)
|
||||
)
|
||||
.select("postcode", CONSERVATION_AREA_FEATURE)
|
||||
)
|
||||
return (
|
||||
all_postcodes.join(matched, on="postcode", how="left")
|
||||
.with_columns(pl.col(CONSERVATION_AREA_FEATURE).fill_null("No"))
|
||||
.select("postcode", CONSERVATION_AREA_FEATURE)
|
||||
)
|
||||
|
||||
|
||||
def _conservation_area_by_postcode(
|
||||
postcodes: pl.LazyFrame,
|
||||
conservation_areas_path: Path,
|
||||
) -> pl.LazyFrame:
|
||||
print(f"Loading conservation area polygons from {conservation_areas_path}...")
|
||||
geometries, crs = _load_conservation_area_geometries(conservation_areas_path)
|
||||
postcode_points = postcodes.select("postcode", "lat", "lon").collect(
|
||||
engine="streaming"
|
||||
)
|
||||
print(
|
||||
"Computing conservation area membership for "
|
||||
f"{postcode_points.height} active English postcodes..."
|
||||
)
|
||||
return _postcode_conservation_area_flags(postcode_points, geometries, crs).lazy()
|
||||
|
||||
|
||||
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
|
||||
"""Convert an IoD deprivation score to a 0-100 less-deprived percentile."""
|
||||
non_null_count = pl.col(column).count()
|
||||
|
|
@ -234,11 +649,13 @@ def _build(
|
|||
noise_path: Path,
|
||||
school_proximity_path: Path,
|
||||
broadband_path: Path,
|
||||
conservation_areas_path: Path,
|
||||
rental_prices_path: Path,
|
||||
lsoa_population_path: Path,
|
||||
median_age_path: Path,
|
||||
election_results_path: Path,
|
||||
tree_density_postcodes_path: Path | None = None,
|
||||
listed_buildings_path: Path | None = None,
|
||||
) -> tuple[pl.DataFrame, pl.DataFrame]:
|
||||
"""Build postcode and properties dataframes from epc_pp + auxiliary data.
|
||||
|
||||
|
|
@ -273,6 +690,29 @@ def _build(
|
|||
).unique(["postcode"])
|
||||
wide = wide.join(postcode_country, on="postcode", how="left")
|
||||
|
||||
if listed_buildings_path is not None:
|
||||
active_postcodes_for_listed = (
|
||||
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001")
|
||||
.filter(pl.col("doterm").is_null())
|
||||
.select(
|
||||
pl.col("pcds").alias("postcode"),
|
||||
"east1m",
|
||||
"north1m",
|
||||
)
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
listed_flags = _listed_building_flags(
|
||||
wide.select("postcode", "pp_address", "epc_address"),
|
||||
active_postcodes_for_listed,
|
||||
listed_buildings_path,
|
||||
)
|
||||
wide = wide.join(listed_flags.lazy(), on=["postcode", "pp_address"], how="left")
|
||||
else:
|
||||
wide = wide.with_columns(
|
||||
pl.lit(None, dtype=pl.Utf8).alias(LISTED_BUILDING_FEATURE)
|
||||
)
|
||||
wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))
|
||||
|
||||
arcgis = (
|
||||
arcgis_raw.filter(pl.col("ctry25cd") == "E92000001") # England only
|
||||
.filter(pl.col("doterm").is_null()) # Active postcodes only
|
||||
|
|
@ -382,6 +822,13 @@ def _build(
|
|||
school_proximity = pl.scan_parquet(school_proximity_path)
|
||||
wide = wide.join(school_proximity, on="postcode", how="left")
|
||||
|
||||
conservation_areas = _conservation_area_by_postcode(
|
||||
arcgis.select("postcode", "lat", "lon"), conservation_areas_path
|
||||
)
|
||||
wide = wide.join(conservation_areas, on="postcode", how="left").with_columns(
|
||||
pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
|
||||
)
|
||||
|
||||
if tree_density_postcodes_path is not None:
|
||||
tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
|
||||
wide = wide.join(tree_density, on="postcode", how="left")
|
||||
|
|
@ -476,7 +923,6 @@ def _build(
|
|||
"Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
|
||||
"Income Deprivation Affecting Children Index (IDACI) Score (rate)",
|
||||
"Barriers to Housing and Services Score",
|
||||
"lsoa21",
|
||||
"oa21",
|
||||
"pcon",
|
||||
"epc_property_type",
|
||||
|
|
@ -598,6 +1044,18 @@ def main():
|
|||
required=True,
|
||||
help="Broadband performance by output area parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--conservation-areas",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Historic England conservation areas GeoPackage",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--listed-buildings",
|
||||
type=Path,
|
||||
required=False,
|
||||
help="Historic England NHLE listed-building points GeoPackage",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--rental-prices",
|
||||
type=Path,
|
||||
|
|
@ -652,11 +1110,13 @@ def main():
|
|||
noise_path=args.noise,
|
||||
school_proximity_path=args.school_proximity,
|
||||
broadband_path=args.broadband,
|
||||
conservation_areas_path=args.conservation_areas,
|
||||
rental_prices_path=args.rental_prices,
|
||||
lsoa_population_path=args.lsoa_population,
|
||||
median_age_path=args.median_age,
|
||||
election_results_path=args.election_results,
|
||||
tree_density_postcodes_path=args.tree_density_postcodes,
|
||||
listed_buildings_path=args.listed_buildings,
|
||||
)
|
||||
|
||||
print(f"\nPostcode columns: {postcode_df.columns}")
|
||||
|
|
|
|||
398
pipeline/transform/noise_overlay_tiles.py
Normal file
398
pipeline/transform/noise_overlay_tiles.py
Normal file
|
|
@ -0,0 +1,398 @@
|
|||
"""Build PMTiles raster tiles for the high-resolution Defra noise overlay.
|
||||
|
||||
This keeps the native 10m strategic-noise rasters as the source of truth and
|
||||
renders transparent PNG XYZ tiles into MBTiles before converting to PMTiles.
|
||||
The dashboard serves the resulting archive through /api/overlays/noise.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import io
|
||||
import math
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import rasterio
|
||||
from PIL import Image
|
||||
from rasterio.enums import Resampling
|
||||
from rasterio.transform import from_bounds
|
||||
from rasterio.warp import reproject, transform_bounds
|
||||
from shapely import STRtree, box
|
||||
|
||||
from pipeline.download.noise import (
|
||||
BNG_MAX_E,
|
||||
BNG_MAX_N,
|
||||
BNG_MIN_E,
|
||||
BNG_MIN_N,
|
||||
NOISE_SOURCES,
|
||||
download_raster,
|
||||
)
|
||||
from pipeline.download.tiles import ensure_pmtiles_cli
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
|
||||
WEB_MERCATOR_CRS = "EPSG:3857"
|
||||
WEB_MERCATOR_EXTENT = 20_037_508.342789244
|
||||
DEFAULT_SOURCE_NAMES = ("road", "rail", "airport")
|
||||
NOISE_COLOR_STOPS = np.array([45.0, 55.0, 65.0, 75.0], dtype=np.float32)
|
||||
NOISE_COLORS = np.array(
|
||||
[
|
||||
[254, 240, 138],
|
||||
[251, 146, 60],
|
||||
[220, 38, 38],
|
||||
[127, 29, 29],
|
||||
],
|
||||
dtype=np.float32,
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class RasterInfo:
|
||||
path: Path
|
||||
bounds_mercator: tuple[float, float, float, float]
|
||||
|
||||
|
||||
def _source_specs(source_names: tuple[str, ...]):
|
||||
requested = {name.lower() for name in source_names}
|
||||
if "all" in requested:
|
||||
requested = set(DEFAULT_SOURCE_NAMES)
|
||||
|
||||
by_name = {label.lower(): spec for label, *spec in NOISE_SOURCES}
|
||||
unknown = sorted(requested - set(by_name))
|
||||
if unknown:
|
||||
raise ValueError(f"Unknown noise source(s): {', '.join(unknown)}")
|
||||
|
||||
return [
|
||||
(name.title(), *by_name[name])
|
||||
for name in DEFAULT_SOURCE_NAMES
|
||||
if name in requested
|
||||
]
|
||||
|
||||
|
||||
def _download_source_rasters(
|
||||
raster_dir: Path,
|
||||
source_names: tuple[str, ...],
|
||||
) -> list[Path]:
|
||||
paths: list[Path] = []
|
||||
raster_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for (
|
||||
label,
|
||||
_col_name,
|
||||
wcs_base,
|
||||
coverage_id,
|
||||
wcs_version,
|
||||
allow_missing_tiles,
|
||||
) in _source_specs(source_names):
|
||||
tile_dir = raster_dir / label.lower()
|
||||
tile_dir.mkdir(parents=True, exist_ok=True)
|
||||
paths.extend(
|
||||
download_raster(
|
||||
tile_dir,
|
||||
wcs_base,
|
||||
coverage_id,
|
||||
label,
|
||||
wcs_version,
|
||||
allow_missing_tiles,
|
||||
)
|
||||
)
|
||||
|
||||
return paths
|
||||
|
||||
|
||||
def _raster_infos(raster_paths: list[Path]) -> list[RasterInfo]:
|
||||
infos: list[RasterInfo] = []
|
||||
for path in raster_paths:
|
||||
with rasterio.open(path) as dataset:
|
||||
if dataset.crs is None:
|
||||
raise ValueError(f"Raster has no CRS: {path}")
|
||||
bounds = transform_bounds(
|
||||
dataset.crs,
|
||||
WEB_MERCATOR_CRS,
|
||||
*dataset.bounds,
|
||||
densify_pts=21,
|
||||
)
|
||||
infos.append(RasterInfo(path=path, bounds_mercator=bounds))
|
||||
return infos
|
||||
|
||||
|
||||
def _england_bounds_wgs84() -> tuple[float, float, float, float]:
|
||||
return transform_bounds(
|
||||
"EPSG:27700",
|
||||
"EPSG:4326",
|
||||
BNG_MIN_E,
|
||||
BNG_MIN_N,
|
||||
BNG_MAX_E,
|
||||
BNG_MAX_N,
|
||||
densify_pts=21,
|
||||
)
|
||||
|
||||
|
||||
def _lonlat_to_tile(lon: float, lat: float, zoom: int) -> tuple[int, int]:
|
||||
lat = max(min(lat, 85.05112878), -85.05112878)
|
||||
n = 1 << zoom
|
||||
x = int(math.floor((lon + 180.0) / 360.0 * n))
|
||||
y = int(
|
||||
math.floor((1.0 - math.asinh(math.tan(math.radians(lat))) / math.pi) / 2.0 * n)
|
||||
)
|
||||
return min(max(x, 0), n - 1), min(max(y, 0), n - 1)
|
||||
|
||||
|
||||
def _tile_bounds_mercator(
|
||||
zoom: int, x: int, y: int
|
||||
) -> tuple[float, float, float, float]:
|
||||
n = 1 << zoom
|
||||
tile_size_m = WEB_MERCATOR_EXTENT * 2 / n
|
||||
left = -WEB_MERCATOR_EXTENT + x * tile_size_m
|
||||
right = left + tile_size_m
|
||||
top = WEB_MERCATOR_EXTENT - y * tile_size_m
|
||||
bottom = top - tile_size_m
|
||||
return left, bottom, right, top
|
||||
|
||||
|
||||
def _read_noise_tile(
|
||||
candidates: list[RasterInfo],
|
||||
bounds_mercator: tuple[float, float, float, float],
|
||||
tile_size: int,
|
||||
) -> np.ndarray:
|
||||
left, bottom, right, top = bounds_mercator
|
||||
merged = np.full((tile_size, tile_size), np.nan, dtype=np.float32)
|
||||
|
||||
for info in candidates:
|
||||
with rasterio.open(info.path) as source:
|
||||
tile = np.full((tile_size, tile_size), np.nan, dtype=np.float32)
|
||||
reproject(
|
||||
source=rasterio.band(source, 1),
|
||||
destination=tile,
|
||||
src_transform=source.transform,
|
||||
src_crs=source.crs,
|
||||
src_nodata=source.nodata if source.nodata is not None else 0,
|
||||
dst_transform=from_bounds(
|
||||
left, bottom, right, top, tile_size, tile_size
|
||||
),
|
||||
dst_crs=WEB_MERCATOR_CRS,
|
||||
dst_nodata=np.nan,
|
||||
resampling=Resampling.bilinear,
|
||||
)
|
||||
|
||||
tile[~np.isfinite(tile) | (tile <= 0)] = np.nan
|
||||
merged = np.fmax(merged, tile)
|
||||
|
||||
return merged
|
||||
|
||||
|
||||
def _encode_noise_png(noise_db: np.ndarray) -> bytes | None:
|
||||
valid = np.isfinite(noise_db) & (noise_db >= NOISE_COLOR_STOPS[0])
|
||||
if not valid.any():
|
||||
return None
|
||||
|
||||
clipped = np.clip(noise_db, NOISE_COLOR_STOPS[0], NOISE_COLOR_STOPS[-1])
|
||||
rgba = np.zeros((*noise_db.shape, 4), dtype=np.uint8)
|
||||
valid_values = clipped[valid]
|
||||
|
||||
for channel in range(3):
|
||||
channel_values = np.interp(
|
||||
valid_values,
|
||||
NOISE_COLOR_STOPS,
|
||||
NOISE_COLORS[:, channel],
|
||||
).astype(np.uint8)
|
||||
rgba[..., channel][valid] = channel_values
|
||||
|
||||
alpha = np.interp(
|
||||
valid_values,
|
||||
[NOISE_COLOR_STOPS[0], NOISE_COLOR_STOPS[-1]],
|
||||
[70, 190],
|
||||
).astype(np.uint8)
|
||||
rgba[..., 3][valid] = alpha
|
||||
|
||||
output = io.BytesIO()
|
||||
Image.fromarray(rgba, mode="RGBA").save(output, format="PNG", optimize=True)
|
||||
return output.getvalue()
|
||||
|
||||
|
||||
def _tile_ranges(
|
||||
bounds_wgs84: tuple[float, float, float, float],
|
||||
zoom: int,
|
||||
) -> tuple[range, range]:
|
||||
west, south, east, north = bounds_wgs84
|
||||
min_x, min_y = _lonlat_to_tile(west, north, zoom)
|
||||
max_x, max_y = _lonlat_to_tile(east, south, zoom)
|
||||
return range(min_x, max_x + 1), range(min_y, max_y + 1)
|
||||
|
||||
|
||||
def _create_mbtiles(
|
||||
raster_infos: list[RasterInfo],
|
||||
mbtiles_path: Path,
|
||||
min_zoom: int,
|
||||
max_zoom: int,
|
||||
tile_size: int,
|
||||
) -> int:
|
||||
if mbtiles_path.exists():
|
||||
mbtiles_path.unlink()
|
||||
|
||||
bounds_wgs84 = _england_bounds_wgs84()
|
||||
geometries = [box(*info.bounds_mercator) for info in raster_infos]
|
||||
tree = STRtree(geometries)
|
||||
|
||||
conn = sqlite3.connect(mbtiles_path)
|
||||
conn.execute("CREATE TABLE metadata (name TEXT, value TEXT)")
|
||||
conn.execute(
|
||||
"CREATE TABLE tiles (zoom_level INTEGER, tile_column INTEGER, "
|
||||
"tile_row INTEGER, tile_data BLOB)"
|
||||
)
|
||||
conn.execute(
|
||||
"CREATE UNIQUE INDEX tile_index ON tiles (zoom_level, tile_column, tile_row)"
|
||||
)
|
||||
conn.executemany(
|
||||
"INSERT INTO metadata (name, value) VALUES (?, ?)",
|
||||
[
|
||||
("name", "Defra Lden noise overlay"),
|
||||
("type", "overlay"),
|
||||
("version", "1"),
|
||||
("description", "Defra Round 4 10m strategic noise Lden overlay"),
|
||||
("format", "png"),
|
||||
(
|
||||
"attribution",
|
||||
"Contains public sector information licensed under the OGL v3.0",
|
||||
),
|
||||
("bounds", ",".join(f"{value:.6f}" for value in bounds_wgs84)),
|
||||
("minzoom", str(min_zoom)),
|
||||
("maxzoom", str(max_zoom)),
|
||||
],
|
||||
)
|
||||
|
||||
total_tiles = 0
|
||||
try:
|
||||
for zoom in range(min_zoom, max_zoom + 1):
|
||||
x_range, y_range = _tile_ranges(bounds_wgs84, zoom)
|
||||
zoom_tiles = 0
|
||||
for x in x_range:
|
||||
for y in y_range:
|
||||
bounds_mercator = _tile_bounds_mercator(zoom, x, y)
|
||||
candidate_indexes = tree.query(box(*bounds_mercator))
|
||||
if len(candidate_indexes) == 0:
|
||||
continue
|
||||
|
||||
candidates = [
|
||||
raster_infos[int(index)] for index in candidate_indexes
|
||||
]
|
||||
tile = _read_noise_tile(candidates, bounds_mercator, tile_size)
|
||||
tile_png = _encode_noise_png(tile)
|
||||
if tile_png is None:
|
||||
continue
|
||||
|
||||
tms_y = (1 << zoom) - 1 - y
|
||||
conn.execute(
|
||||
"INSERT INTO tiles VALUES (?, ?, ?, ?)",
|
||||
(zoom, x, tms_y, tile_png),
|
||||
)
|
||||
zoom_tiles += 1
|
||||
total_tiles += 1
|
||||
|
||||
conn.commit()
|
||||
print(f"Zoom {zoom}: wrote {zoom_tiles:,} PNG tiles")
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
return total_tiles
|
||||
|
||||
|
||||
def build_noise_overlay_tiles(
|
||||
output_path: Path,
|
||||
raster_dir: Path,
|
||||
source_names: tuple[str, ...],
|
||||
input_rasters: tuple[Path, ...],
|
||||
pmtiles_bin: Path,
|
||||
pmtiles_version: str,
|
||||
min_zoom: int,
|
||||
max_zoom: int,
|
||||
tile_size: int,
|
||||
) -> None:
|
||||
if min_zoom > max_zoom:
|
||||
raise ValueError("--min-zoom must be <= --max-zoom")
|
||||
|
||||
raster_paths = list(input_rasters) or _download_source_rasters(
|
||||
raster_dir, source_names
|
||||
)
|
||||
if not raster_paths:
|
||||
raise FileNotFoundError("No noise raster GeoTIFFs available")
|
||||
|
||||
print(f"Preparing {len(raster_paths):,} noise raster tile(s)")
|
||||
raster_infos = _raster_infos(raster_paths)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
ensure_pmtiles_cli(pmtiles_bin, pmtiles_version)
|
||||
|
||||
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
|
||||
mbtiles_path = Path(tmp) / "noise_lden_10m.mbtiles"
|
||||
tile_count = _create_mbtiles(
|
||||
raster_infos, mbtiles_path, min_zoom, max_zoom, tile_size
|
||||
)
|
||||
if tile_count == 0:
|
||||
raise RuntimeError("Noise overlay generation produced no tiles")
|
||||
|
||||
subprocess.run(
|
||||
[
|
||||
str(pmtiles_bin),
|
||||
"convert",
|
||||
str(mbtiles_path),
|
||||
str(output_path),
|
||||
"--force",
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
|
||||
size_mb = output_path.stat().st_size / (1024 * 1024)
|
||||
print(f"Wrote {output_path} ({size_mb:.1f} MB)")
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--output", type=Path, required=True)
|
||||
parser.add_argument(
|
||||
"--raster-dir",
|
||||
type=Path,
|
||||
default=Path("property-data/noise_overlay_rasters"),
|
||||
help="Cache directory for downloaded Defra WCS GeoTIFF tiles",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--source",
|
||||
action="append",
|
||||
dest="sources",
|
||||
choices=("all", *DEFAULT_SOURCE_NAMES),
|
||||
help="Noise source to include; repeatable. Defaults to all.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input-raster",
|
||||
action="append",
|
||||
dest="input_rasters",
|
||||
type=Path,
|
||||
help="Existing GeoTIFF to render instead of downloading WCS rasters",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pmtiles-bin", type=Path, default=Path("property-data/pmtiles")
|
||||
)
|
||||
parser.add_argument("--pmtiles-version", default="1.22.3")
|
||||
parser.add_argument("--min-zoom", type=int, default=13)
|
||||
parser.add_argument("--max-zoom", type=int, default=14)
|
||||
parser.add_argument("--tile-size", type=int, default=256)
|
||||
args = parser.parse_args()
|
||||
|
||||
build_noise_overlay_tiles(
|
||||
output_path=args.output,
|
||||
raster_dir=args.raster_dir,
|
||||
source_names=tuple(args.sources or ("all",)),
|
||||
input_rasters=tuple(args.input_rasters or ()),
|
||||
pmtiles_bin=args.pmtiles_bin,
|
||||
pmtiles_version=args.pmtiles_version,
|
||||
min_zoom=args.min_zoom,
|
||||
max_zoom=args.max_zoom,
|
||||
tile_size=args.tile_size,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -39,6 +39,8 @@ def main():
|
|||
pl.col("Ofsted phase").is_in(["Primary", "Secondary"])
|
||||
& pl.col("Latest OEIF overall effectiveness").is_in(["1", "2"])
|
||||
)
|
||||
if ofsted.is_empty():
|
||||
raise ValueError("No good+ primary/secondary Ofsted schools found")
|
||||
|
||||
print(f"Good+ schools: {len(ofsted):,}")
|
||||
print(
|
||||
|
|
@ -74,6 +76,8 @@ def main():
|
|||
)
|
||||
|
||||
schools = ofsted.join(arcgis, on="postcode", how="inner")
|
||||
if schools.is_empty():
|
||||
raise ValueError("No Ofsted schools matched ArcGIS postcode coordinates")
|
||||
print(f"Schools with coordinates: {len(schools):,}")
|
||||
|
||||
# Load all postcodes for proximity counting
|
||||
|
|
@ -88,6 +92,7 @@ def main():
|
|||
|
||||
result = counts_5km.join(counts_2km, on="postcode")
|
||||
|
||||
args.output.parent.mkdir(parents=True, exist_ok=True)
|
||||
result.write_parquet(args.output)
|
||||
size_mb = args.output.stat().st_size / (1024 * 1024)
|
||||
print(f"Wrote {args.output} ({size_mb:.1f} MB)")
|
||||
|
|
|
|||
|
|
@ -95,6 +95,69 @@ def test_transform_crime_annualises_over_all_valid_months(tmp_path):
|
|||
]
|
||||
|
||||
|
||||
def test_transform_crime_writes_by_year_output(tmp_path):
|
||||
crime_dir = tmp_path / "crime"
|
||||
jan23 = crime_dir / "2023-01"
|
||||
jan24 = crime_dir / "2024-01"
|
||||
feb24 = crime_dir / "2024-02"
|
||||
for d in (jan23, jan24, feb24):
|
||||
d.mkdir(parents=True)
|
||||
|
||||
header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
|
||||
(jan23 / "2023-01-test-force-street.csv").write_text(
|
||||
"\n".join(
|
||||
[
|
||||
header,
|
||||
"1,2023-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
|
||||
"2,2023-01,F,F,-0.1,51.5,X,E01000001,L,Robbery,U,",
|
||||
]
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
(jan24 / "2024-01-test-force-street.csv").write_text(
|
||||
"\n".join(
|
||||
[
|
||||
header,
|
||||
"3,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
|
||||
"4,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
|
||||
]
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
(feb24 / "2024-02-test-force-street.csv").write_text(
|
||||
"\n".join(
|
||||
[
|
||||
header,
|
||||
"5,2024-02,F,F,-0.1,51.5,X,E01000001,L,Anti-social behaviour,U,",
|
||||
]
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
output = tmp_path / "crime.parquet"
|
||||
by_year_output = tmp_path / "crime_by_year.parquet"
|
||||
transform_crime(crime_dir, output, by_year_output)
|
||||
|
||||
by_year = pl.read_parquet(by_year_output)
|
||||
assert by_year.height == 1
|
||||
cols = set(by_year.columns)
|
||||
assert "Burglary (by year)" in cols
|
||||
assert "Serious crime (by year)" in cols
|
||||
assert "Minor crime (by year)" in cols
|
||||
|
||||
row = by_year.row(0, named=True)
|
||||
burglary = sorted(row["Burglary (by year)"], key=lambda r: r["year"])
|
||||
# 2023: 1 burglary in 1 month → 12/yr; 2024: 2 in 2 months → 12/yr
|
||||
assert burglary == [
|
||||
{"year": 2023, "count": 12.0},
|
||||
{"year": 2024, "count": 12.0},
|
||||
]
|
||||
# Serious crime in 2023 = Burglary(12) + Robbery(12) = 24
|
||||
serious = {p["year"]: p["count"] for p in row["Serious crime (by year)"]}
|
||||
assert serious[2023] == 24.0
|
||||
assert serious[2024] == 12.0
|
||||
|
||||
|
||||
def test_transform_crime_fails_without_valid_months(tmp_path):
|
||||
crime_dir = tmp_path / "crime"
|
||||
month_dir = crime_dir / "2024-01"
|
||||
|
|
@ -117,3 +180,49 @@ def test_transform_crime_fails_without_valid_months(tmp_path):
|
|||
assert "No valid crime months" in str(exc)
|
||||
else:
|
||||
raise AssertionError("Expected ValueError")
|
||||
|
||||
|
||||
def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
|
||||
crime_dir = tmp_path / "crime"
|
||||
month_dir = crime_dir / "2024-01"
|
||||
month_dir.mkdir(parents=True)
|
||||
|
||||
header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
|
||||
# E01000001 was split into two 2021 LSOAs; E01000099 is unchanged.
|
||||
(month_dir / "2024-01-test-force-street.csv").write_text(
|
||||
"\n".join(
|
||||
[
|
||||
header,
|
||||
"1,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
|
||||
"2,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
|
||||
"3,2024-01,F,F,-0.1,51.5,X,E01000099,L,Burglary,U,",
|
||||
]
|
||||
)
|
||||
+ "\n"
|
||||
)
|
||||
|
||||
lookup_path = tmp_path / "lookup.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
"lsoa11": ["E01000001", "E01000001", "E01000099"],
|
||||
"lsoa21": ["E01000050", "E01000051", "E01000099"],
|
||||
}
|
||||
).write_parquet(lookup_path)
|
||||
|
||||
output = tmp_path / "crime.parquet"
|
||||
by_year_output = tmp_path / "by_year.parquet"
|
||||
transform_crime(crime_dir, output, by_year_output, lookup_path)
|
||||
|
||||
# Split LSOA: 2 burglaries split evenly → 1/yr each child, annualised to 12/yr each.
|
||||
avg = pl.read_parquet(output).sort("LSOA code").to_dicts()
|
||||
assert avg == [
|
||||
{"LSOA code": "E01000050", "Burglary (avg/yr)": 12.0},
|
||||
{"LSOA code": "E01000051", "Burglary (avg/yr)": 12.0},
|
||||
{"LSOA code": "E01000099", "Burglary (avg/yr)": 12.0},
|
||||
]
|
||||
|
||||
by_year = pl.read_parquet(by_year_output).sort("LSOA code").to_dicts()
|
||||
burglaries = {row["LSOA code"]: row["Burglary (by year)"] for row in by_year}
|
||||
assert burglaries["E01000050"] == [{"year": 2024, "count": 12.0}]
|
||||
assert burglaries["E01000051"] == [{"year": 2024, "count": 12.0}]
|
||||
assert burglaries["E01000099"] == [{"year": 2024, "count": 12.0}]
|
||||
|
|
|
|||
|
|
@ -1,11 +1,17 @@
|
|||
import polars as pl
|
||||
import pytest
|
||||
from shapely import box
|
||||
|
||||
from pipeline.transform.merge import (
|
||||
_AREA_COLUMNS,
|
||||
CONSERVATION_AREA_FEATURE,
|
||||
LISTED_BUILDING_FEATURE,
|
||||
TREE_DENSITY_FEATURE,
|
||||
_is_dynamic_poi_metric_column,
|
||||
_less_deprived_percentile_expr,
|
||||
_matched_listed_building_flags,
|
||||
_postcode_conservation_area_flags,
|
||||
_postcode_listed_building_candidates,
|
||||
_tree_density_by_postcode,
|
||||
_validate_lad_source_coverage,
|
||||
_validate_property_postcodes,
|
||||
|
|
@ -48,6 +54,106 @@ def test_country_code_is_kept_in_postcode_area_columns() -> None:
|
|||
assert "ctry25cd" in _AREA_COLUMNS
|
||||
|
||||
|
||||
def test_conservation_area_feature_is_area_level() -> None:
|
||||
assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
|
||||
|
||||
|
||||
def test_listed_building_feature_is_property_level() -> None:
|
||||
assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
|
||||
|
||||
|
||||
def test_postcode_conservation_area_flags_marks_point_membership() -> None:
|
||||
postcodes = pl.DataFrame(
|
||||
{
|
||||
"postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
|
||||
"lat": [0.5, 2.0, None],
|
||||
"lon": [0.5, 2.0, 0.5],
|
||||
}
|
||||
)
|
||||
|
||||
result = _postcode_conservation_area_flags(
|
||||
postcodes, [box(0, 0, 1, 1)], "EPSG:4326", batch_size=2
|
||||
).sort("postcode")
|
||||
|
||||
assert result.to_dicts() == [
|
||||
{"postcode": "AA1 1AA", CONSERVATION_AREA_FEATURE: "Yes"},
|
||||
{"postcode": "BB1 1BB", CONSERVATION_AREA_FEATURE: "No"},
|
||||
{"postcode": "CC1 1CC", CONSERVATION_AREA_FEATURE: "No"},
|
||||
]
|
||||
|
||||
|
||||
def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
|
||||
listed_points = pl.DataFrame(
|
||||
{
|
||||
"ListEntry": [1234, 5678],
|
||||
"Name": ["1 and 2 High Street", "Distant Hall"],
|
||||
"Grade": ["II", "I"],
|
||||
"Easting": [100.0, 1000.0],
|
||||
"Northing": [100.0, 1000.0],
|
||||
}
|
||||
).with_columns(
|
||||
pl.col("Name")
|
||||
.str.to_uppercase()
|
||||
.str.replace_all(r"[^0-9A-Z]+", " ")
|
||||
.str.replace_all(r"\s+", " ")
|
||||
.str.strip_chars()
|
||||
.alias("_listed_match_name")
|
||||
)
|
||||
active_postcodes = pl.DataFrame(
|
||||
{
|
||||
"postcode": ["AA1 1AA", "BB1 1BB"],
|
||||
"east1m": [105.0, 5000.0],
|
||||
"north1m": [105.0, 5000.0],
|
||||
}
|
||||
)
|
||||
|
||||
result = _postcode_listed_building_candidates(
|
||||
listed_points,
|
||||
active_postcodes,
|
||||
nearest_postcodes=1,
|
||||
max_distance_m=25,
|
||||
)
|
||||
|
||||
assert result.select("postcode", "_listed_match_name").to_dicts() == [
|
||||
{"postcode": "AA1 1AA", "_listed_match_name": "1 AND 2 HIGH STREET"}
|
||||
]
|
||||
|
||||
|
||||
def test_matched_listed_building_flags_requires_address_match() -> None:
|
||||
properties = pl.DataFrame(
|
||||
{
|
||||
"postcode": ["AA1 1AA", "AA1 1AA", "BB1 1BB"],
|
||||
"pp_address": ["1 HIGH STREET", "99 HIGH STREET", "THE OLD RECTORY"],
|
||||
"epc_address": ["1, High Street", "99, High Street", "Old Rectory"],
|
||||
}
|
||||
)
|
||||
listed_candidates = pl.DataFrame(
|
||||
{
|
||||
"postcode": ["AA1 1AA", "BB1 1BB"],
|
||||
"_listed_match_name": ["1 AND 2 HIGH STREET", "OLD RECTORY"],
|
||||
"_listed_grade": ["II", "II*"],
|
||||
"_listed_entry": [1234, 5678],
|
||||
}
|
||||
)
|
||||
|
||||
result = _matched_listed_building_flags(
|
||||
properties.lazy(), listed_candidates, min_score=95
|
||||
).sort("postcode", "pp_address")
|
||||
|
||||
assert result.to_dicts() == [
|
||||
{
|
||||
"postcode": "AA1 1AA",
|
||||
"pp_address": "1 HIGH STREET",
|
||||
LISTED_BUILDING_FEATURE: "Yes",
|
||||
},
|
||||
{
|
||||
"postcode": "BB1 1BB",
|
||||
"pp_address": "THE OLD RECTORY",
|
||||
LISTED_BUILDING_FEATURE: "Yes",
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def test_validate_property_postcodes_rejects_blank_rows() -> None:
|
||||
df = pl.DataFrame(
|
||||
{
|
||||
|
|
|
|||
|
|
@ -182,6 +182,19 @@ DROP_CATEGORIES = {
|
|||
"public_transport/platform",
|
||||
"public_transport/station",
|
||||
"public_transport/stop_position",
|
||||
# Education amenities — schools come from GIAS instead. OSM coverage for
|
||||
# tertiary education, tutoring, and childcare is too noisy/incomplete to be
|
||||
# useful on a property-search map.
|
||||
"amenity/school",
|
||||
"amenity/prep_school",
|
||||
"amenity/language_school",
|
||||
"amenity/music_school",
|
||||
"amenity/university",
|
||||
"amenity/college",
|
||||
"building/university",
|
||||
"amenity/kindergarten",
|
||||
"amenity/childcare",
|
||||
"office/tutoring",
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -943,23 +956,10 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
"tourism/chalet",
|
||||
],
|
||||
),
|
||||
(
|
||||
"Education",
|
||||
"School",
|
||||
"🏫",
|
||||
[
|
||||
"amenity/school",
|
||||
"amenity/prep_school",
|
||||
"amenity/language_school",
|
||||
"amenity/music_school",
|
||||
"amenity/university",
|
||||
"amenity/college",
|
||||
"building/university",
|
||||
"amenity/kindergarten",
|
||||
"amenity/childcare",
|
||||
"office/tutoring",
|
||||
],
|
||||
),
|
||||
# Note: schools come from the GIAS register (see transform_gias_schools).
|
||||
# Niche/tertiary education amenities that GIAS does not cover are dropped
|
||||
# rather than mixed in with state-funded schools.
|
||||
|
||||
(
|
||||
"Local Businesses",
|
||||
"Hotel",
|
||||
|
|
@ -1316,11 +1316,45 @@ def transform_grocery_retail_points(
|
|||
).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")
|
||||
|
||||
|
||||
def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
|
||||
"""Convert the GIAS register parquet into POI rows with school metadata."""
|
||||
return pl.scan_parquet(gias_path).select(
|
||||
pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
|
||||
pl.col("name"),
|
||||
pl.lit("School").alias("category"),
|
||||
pl.lit("School").alias("icon_category"),
|
||||
pl.lit("Education").alias("group"),
|
||||
pl.col("lat").cast(pl.Float64),
|
||||
pl.col("lng").cast(pl.Float64),
|
||||
pl.lit("🏫").alias("emoji"),
|
||||
pl.col("phase").alias("school_phase"),
|
||||
pl.col("type").alias("school_type"),
|
||||
pl.col("type_group").alias("school_type_group"),
|
||||
pl.col("age_range").alias("school_age_range"),
|
||||
pl.col("gender").alias("school_gender"),
|
||||
pl.col("religious_character").alias("school_religious_character"),
|
||||
pl.col("admissions_policy").alias("school_admissions_policy"),
|
||||
pl.col("nursery_provision").alias("school_nursery_provision"),
|
||||
pl.col("sixth_form").alias("school_sixth_form"),
|
||||
pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
|
||||
pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
|
||||
pl.col("fsm_percent").cast(pl.Float32, strict=False).alias("school_fsm_percent"),
|
||||
pl.col("trust").alias("school_trust"),
|
||||
pl.col("address").alias("school_address"),
|
||||
pl.col("postcode").alias("school_postcode"),
|
||||
pl.col("local_authority").alias("school_local_authority"),
|
||||
pl.col("website").alias("school_website"),
|
||||
pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
|
||||
pl.col("head_name").alias("school_head_name"),
|
||||
)
|
||||
|
||||
|
||||
def transform(
|
||||
input_path: Path,
|
||||
naptan_path: Path | None = None,
|
||||
boundary_path: Path | None = None,
|
||||
grocery_retail_points_path: Path | None = None,
|
||||
naptan_path: Path,
|
||||
boundary_path: Path,
|
||||
grocery_retail_points_path: Path,
|
||||
gias_path: Path,
|
||||
) -> pl.LazyFrame:
|
||||
lf = pl.scan_parquet(input_path)
|
||||
|
||||
|
|
@ -1372,24 +1406,21 @@ def transform(
|
|||
)
|
||||
|
||||
naptan_df = pl.scan_parquet(naptan_path).collect()
|
||||
if boundary_path is not None:
|
||||
mask = in_england_mask(
|
||||
boundary_path,
|
||||
naptan_df["lat"].to_numpy(),
|
||||
naptan_df["lng"].to_numpy(),
|
||||
)
|
||||
naptan_df = naptan_df.filter(pl.Series(mask))
|
||||
mask = in_england_mask(
|
||||
boundary_path,
|
||||
naptan_df["lat"].to_numpy(),
|
||||
naptan_df["lng"].to_numpy(),
|
||||
)
|
||||
naptan_df = naptan_df.filter(pl.Series(mask))
|
||||
naptan = naptan_df.lazy().with_columns(
|
||||
pl.col("category").replace_strict(NAPTAN_EMOJIS).alias("emoji"),
|
||||
pl.lit("Public Transport").alias("group"),
|
||||
pl.col("category").alias("icon_category"),
|
||||
)
|
||||
|
||||
frames = [lf, naptan]
|
||||
if grocery_retail_points_path is not None:
|
||||
grocery_df = pl.read_parquet(grocery_retail_points_path)
|
||||
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
|
||||
frames.append(grocery_pois.lazy())
|
||||
grocery_df = pl.read_parquet(grocery_retail_points_path)
|
||||
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
|
||||
frames = [lf, naptan, grocery_pois.lazy(), transform_gias_schools(gias_path)]
|
||||
|
||||
return pl.concat(frames, how="diagonal_relaxed")
|
||||
|
||||
|
|
@ -1413,8 +1444,15 @@ def main():
|
|||
parser.add_argument(
|
||||
"--grocery-retail-points",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="GEOLYTIX Grocery Retail Points parquet",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gias",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="GIAS schools register parquet (replaces OSM schools)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
|
||||
)
|
||||
|
|
@ -1425,6 +1463,7 @@ def main():
|
|||
args.naptan,
|
||||
args.boundary,
|
||||
args.grocery_retail_points,
|
||||
args.gias,
|
||||
).collect(engine="streaming")
|
||||
|
||||
df.write_parquet(args.output)
|
||||
|
|
|
|||
269
pipeline/transform/tree_overlay_tiles.py
Normal file
269
pipeline/transform/tree_overlay_tiles.py
Normal file
|
|
@ -0,0 +1,269 @@
|
|||
"""Build PMTiles polygon tiles for the Trees Outside Woodland overlay."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pyogrio
|
||||
import shapely
|
||||
from pyproj import Transformer
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.transform.tree_density import (
|
||||
DEFAULT_TOW_TYPES,
|
||||
_layers,
|
||||
_tow_dataset_path,
|
||||
_where_for_tow_types,
|
||||
)
|
||||
|
||||
|
||||
def _require_tippecanoe() -> str:
|
||||
executable = shutil.which("tippecanoe")
|
||||
if executable is None:
|
||||
raise RuntimeError(
|
||||
"tippecanoe is required to build tree overlay PMTiles. "
|
||||
"Install tippecanoe and rerun this target."
|
||||
)
|
||||
return executable
|
||||
|
||||
|
||||
def _column_or_none(batch, names: list[str], column: str):
|
||||
if column not in names:
|
||||
return None
|
||||
return batch.column(names.index(column)).to_numpy(zero_copy_only=False)
|
||||
|
||||
|
||||
def _number_or_none(value) -> float | int | None:
|
||||
if value is None:
|
||||
return None
|
||||
try:
|
||||
if np.isfinite(value):
|
||||
if float(value).is_integer():
|
||||
return int(value)
|
||||
return round(float(value), 2)
|
||||
except TypeError:
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _write_tree_geojsonseq(
|
||||
dataset_path: str,
|
||||
output_path: Path,
|
||||
tow_types: tuple[str, ...],
|
||||
batch_size: int,
|
||||
layer_names: tuple[str, ...] | None,
|
||||
max_features_per_layer: int | None,
|
||||
) -> int:
|
||||
to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
||||
where = _where_for_tow_types(tow_types)
|
||||
layers = _layers(dataset_path, layer_names)
|
||||
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
|
||||
if where:
|
||||
print(f"TOW type filter: {where}")
|
||||
|
||||
columns = [
|
||||
"TOW_ID",
|
||||
"Woodland_Type",
|
||||
"TOW_Area_M",
|
||||
"MEANHT",
|
||||
"MINHT",
|
||||
"MAXHT",
|
||||
"LiDAR_Survey_Year",
|
||||
]
|
||||
feature_count = 0
|
||||
|
||||
with output_path.open("w") as file:
|
||||
for layer in layers:
|
||||
info = pyogrio.read_info(dataset_path, layer=layer)
|
||||
print(f"\nLayer {layer}: {info.get('features', 0):,} features")
|
||||
layer_features_seen = 0
|
||||
|
||||
with pyogrio.open_arrow(
|
||||
dataset_path,
|
||||
layer=layer,
|
||||
columns=columns,
|
||||
where=where,
|
||||
batch_size=batch_size,
|
||||
use_pyarrow=True,
|
||||
) as (_meta, reader):
|
||||
for batch in reader:
|
||||
if max_features_per_layer is not None:
|
||||
remaining = max_features_per_layer - layer_features_seen
|
||||
if remaining <= 0:
|
||||
break
|
||||
if batch.num_rows > remaining:
|
||||
batch = batch.slice(0, remaining)
|
||||
|
||||
layer_features_seen += batch.num_rows
|
||||
names = batch.schema.names
|
||||
area = np.asarray(
|
||||
batch.column(names.index("TOW_Area_M")).to_numpy(
|
||||
zero_copy_only=False
|
||||
),
|
||||
dtype=np.float64,
|
||||
)
|
||||
geometry = np.asarray(
|
||||
batch.column(names.index("SHAPE")).to_numpy(
|
||||
zero_copy_only=False
|
||||
),
|
||||
dtype=object,
|
||||
)
|
||||
valid = np.isfinite(area) & (area > 0)
|
||||
if not valid.any():
|
||||
continue
|
||||
|
||||
tow_id = _column_or_none(batch, names, "TOW_ID")
|
||||
woodland_type = _column_or_none(batch, names, "Woodland_Type")
|
||||
mean_height = _column_or_none(batch, names, "MEANHT")
|
||||
min_height = _column_or_none(batch, names, "MINHT")
|
||||
max_height = _column_or_none(batch, names, "MAXHT")
|
||||
lidar_year = _column_or_none(batch, names, "LiDAR_Survey_Year")
|
||||
|
||||
geometries = shapely.from_wkb(geometry[valid])
|
||||
geometries = shapely.transform(
|
||||
geometries,
|
||||
to_wgs84.transform,
|
||||
interleaved=False,
|
||||
)
|
||||
geometries_json = shapely.to_geojson(geometries)
|
||||
valid_indexes = np.flatnonzero(valid)
|
||||
|
||||
for idx, geometry_json in zip(valid_indexes, geometries_json):
|
||||
properties = {
|
||||
"tow_id": str(tow_id[idx]) if tow_id is not None else "",
|
||||
"woodland_type": (
|
||||
str(woodland_type[idx])
|
||||
if woodland_type is not None
|
||||
else ""
|
||||
),
|
||||
"area_sqm": _number_or_none(area[idx]),
|
||||
"mean_height_m": (
|
||||
_number_or_none(mean_height[idx])
|
||||
if mean_height is not None
|
||||
else None
|
||||
),
|
||||
"min_height_m": (
|
||||
_number_or_none(min_height[idx])
|
||||
if min_height is not None
|
||||
else None
|
||||
),
|
||||
"max_height_m": (
|
||||
_number_or_none(max_height[idx])
|
||||
if max_height is not None
|
||||
else None
|
||||
),
|
||||
"lidar_year": (
|
||||
_number_or_none(lidar_year[idx])
|
||||
if lidar_year is not None
|
||||
else None
|
||||
),
|
||||
"source_layer": layer,
|
||||
}
|
||||
feature = {
|
||||
"type": "Feature",
|
||||
"geometry": json.loads(geometry_json),
|
||||
"properties": properties,
|
||||
}
|
||||
file.write(json.dumps(feature, separators=(",", ":")) + "\n")
|
||||
feature_count += 1
|
||||
|
||||
return feature_count
|
||||
|
||||
|
||||
def build_tree_overlay_tiles(
|
||||
tow_zip: Path,
|
||||
output_path: Path,
|
||||
extract_dir: Path,
|
||||
tow_types: tuple[str, ...],
|
||||
batch_size: int,
|
||||
layer_names: tuple[str, ...] | None,
|
||||
max_features_per_layer: int | None,
|
||||
min_zoom: int,
|
||||
max_zoom: int,
|
||||
force_extract: bool,
|
||||
use_vsizip: bool,
|
||||
) -> None:
|
||||
tippecanoe = _require_tippecanoe()
|
||||
dataset_path = _tow_dataset_path(tow_zip, extract_dir, force_extract, use_vsizip)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
|
||||
ndjson_path = Path(tmp) / "trees_outside_woodlands.geojsonseq"
|
||||
feature_count = _write_tree_geojsonseq(
|
||||
dataset_path,
|
||||
ndjson_path,
|
||||
tow_types,
|
||||
batch_size,
|
||||
layer_names,
|
||||
max_features_per_layer,
|
||||
)
|
||||
print(f"Writing {feature_count:,} TOW polygon features")
|
||||
|
||||
subprocess.run(
|
||||
[
|
||||
tippecanoe,
|
||||
"--force",
|
||||
"--output",
|
||||
str(output_path),
|
||||
"--layer",
|
||||
"trees_outside_woodlands",
|
||||
"--minimum-zoom",
|
||||
str(min_zoom),
|
||||
"--maximum-zoom",
|
||||
str(max_zoom),
|
||||
"--drop-smallest-as-needed",
|
||||
"--extend-zooms-if-still-dropping",
|
||||
str(ndjson_path),
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument("--tow-zip", type=Path, required=True)
|
||||
parser.add_argument("--output", type=Path, required=True)
|
||||
parser.add_argument(
|
||||
"--extract-dir",
|
||||
type=Path,
|
||||
default=Path("property-data/fr_tow_v1_all"),
|
||||
help="Directory used to extract the FileGDB",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tow-type",
|
||||
action="append",
|
||||
dest="tow_types",
|
||||
help="Woodland_Type to include; repeatable. Defaults to TOW outside-woodland classes.",
|
||||
)
|
||||
parser.add_argument("--batch-size", type=int, default=50_000)
|
||||
parser.add_argument("--layer", action="append", dest="layers")
|
||||
parser.add_argument("--max-features-per-layer", type=int)
|
||||
parser.add_argument("--min-zoom", type=int, default=15)
|
||||
parser.add_argument("--max-zoom", type=int, default=17)
|
||||
parser.add_argument("--force-extract", action="store_true")
|
||||
parser.add_argument("--use-vsizip", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
build_tree_overlay_tiles(
|
||||
tow_zip=args.tow_zip,
|
||||
output_path=args.output,
|
||||
extract_dir=args.extract_dir,
|
||||
tow_types=tuple(args.tow_types or DEFAULT_TOW_TYPES),
|
||||
batch_size=args.batch_size,
|
||||
layer_names=tuple(args.layers) if args.layers else None,
|
||||
max_features_per_layer=args.max_features_per_layer,
|
||||
min_zoom=args.min_zoom,
|
||||
max_zoom=args.max_zoom,
|
||||
force_extract=args.force_extract,
|
||||
use_vsizip=args.use_vsizip,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue