perfect-postcode/pipeline/transform/tree_overlay_tiles.py
2026-05-25 13:20:17 +01:00

269 lines
9.2 KiB
Python

"""Build PMTiles polygon tiles for the Trees Outside Woodland overlay."""
from __future__ import annotations
import argparse
import json
import shutil
import subprocess
import tempfile
from pathlib import Path
import numpy as np
import pyogrio
import shapely
from pyproj import Transformer
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.tree_density import (
DEFAULT_TOW_TYPES,
_layers,
_tow_dataset_path,
_where_for_tow_types,
)
def _require_tippecanoe() -> str:
executable = shutil.which("tippecanoe")
if executable is None:
raise RuntimeError(
"tippecanoe is required to build tree overlay PMTiles. "
"Install tippecanoe and rerun this target."
)
return executable
def _column_or_none(batch, names: list[str], column: str):
if column not in names:
return None
return batch.column(names.index(column)).to_numpy(zero_copy_only=False)
def _number_or_none(value) -> float | int | None:
if value is None:
return None
try:
if np.isfinite(value):
if float(value).is_integer():
return int(value)
return round(float(value), 2)
except TypeError:
return None
return None
def _write_tree_geojsonseq(
dataset_path: str,
output_path: Path,
tow_types: tuple[str, ...],
batch_size: int,
layer_names: tuple[str, ...] | None,
max_features_per_layer: int | None,
) -> int:
to_wgs84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
where = _where_for_tow_types(tow_types)
layers = _layers(dataset_path, layer_names)
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
if where:
print(f"TOW type filter: {where}")
columns = [
"TOW_ID",
"Woodland_Type",
"TOW_Area_M",
"MEANHT",
"MINHT",
"MAXHT",
"LiDAR_Survey_Year",
]
feature_count = 0
with output_path.open("w") as file:
for layer in layers:
info = pyogrio.read_info(dataset_path, layer=layer)
print(f"\nLayer {layer}: {info.get('features', 0):,} features")
layer_features_seen = 0
with pyogrio.open_arrow(
dataset_path,
layer=layer,
columns=columns,
where=where,
batch_size=batch_size,
use_pyarrow=True,
) as (_meta, reader):
for batch in reader:
if max_features_per_layer is not None:
remaining = max_features_per_layer - layer_features_seen
if remaining <= 0:
break
if batch.num_rows > remaining:
batch = batch.slice(0, remaining)
layer_features_seen += batch.num_rows
names = batch.schema.names
area = np.asarray(
batch.column(names.index("TOW_Area_M")).to_numpy(
zero_copy_only=False
),
dtype=np.float64,
)
geometry = np.asarray(
batch.column(names.index("SHAPE")).to_numpy(
zero_copy_only=False
),
dtype=object,
)
valid = np.isfinite(area) & (area > 0)
if not valid.any():
continue
tow_id = _column_or_none(batch, names, "TOW_ID")
woodland_type = _column_or_none(batch, names, "Woodland_Type")
mean_height = _column_or_none(batch, names, "MEANHT")
min_height = _column_or_none(batch, names, "MINHT")
max_height = _column_or_none(batch, names, "MAXHT")
lidar_year = _column_or_none(batch, names, "LiDAR_Survey_Year")
geometries = shapely.from_wkb(geometry[valid])
geometries = shapely.transform(
geometries,
to_wgs84.transform,
interleaved=False,
)
geometries_json = shapely.to_geojson(geometries)
valid_indexes = np.flatnonzero(valid)
for idx, geometry_json in zip(valid_indexes, geometries_json):
properties = {
"tow_id": str(tow_id[idx]) if tow_id is not None else "",
"woodland_type": (
str(woodland_type[idx])
if woodland_type is not None
else ""
),
"area_sqm": _number_or_none(area[idx]),
"mean_height_m": (
_number_or_none(mean_height[idx])
if mean_height is not None
else None
),
"min_height_m": (
_number_or_none(min_height[idx])
if min_height is not None
else None
),
"max_height_m": (
_number_or_none(max_height[idx])
if max_height is not None
else None
),
"lidar_year": (
_number_or_none(lidar_year[idx])
if lidar_year is not None
else None
),
"source_layer": layer,
}
feature = {
"type": "Feature",
"geometry": json.loads(geometry_json),
"properties": properties,
}
file.write(json.dumps(feature, separators=(",", ":")) + "\n")
feature_count += 1
return feature_count
def build_tree_overlay_tiles(
tow_zip: Path,
output_path: Path,
extract_dir: Path,
tow_types: tuple[str, ...],
batch_size: int,
layer_names: tuple[str, ...] | None,
max_features_per_layer: int | None,
min_zoom: int,
max_zoom: int,
force_extract: bool,
use_vsizip: bool,
) -> None:
tippecanoe = _require_tippecanoe()
dataset_path = _tow_dataset_path(tow_zip, extract_dir, force_extract, use_vsizip)
output_path.parent.mkdir(parents=True, exist_ok=True)
with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as tmp:
ndjson_path = Path(tmp) / "trees_outside_woodlands.geojsonseq"
feature_count = _write_tree_geojsonseq(
dataset_path,
ndjson_path,
tow_types,
batch_size,
layer_names,
max_features_per_layer,
)
print(f"Writing {feature_count:,} TOW polygon features")
subprocess.run(
[
tippecanoe,
"--force",
"--output",
str(output_path),
"--layer",
"trees_outside_woodlands",
"--minimum-zoom",
str(min_zoom),
"--maximum-zoom",
str(max_zoom),
"--drop-smallest-as-needed",
"--extend-zooms-if-still-dropping",
str(ndjson_path),
],
check=True,
)
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--tow-zip", type=Path, required=True)
parser.add_argument("--output", type=Path, required=True)
parser.add_argument(
"--extract-dir",
type=Path,
default=Path("property-data/fr_tow_v1_all"),
help="Directory used to extract the FileGDB",
)
parser.add_argument(
"--tow-type",
action="append",
dest="tow_types",
help="Woodland_Type to include; repeatable. Defaults to TOW outside-woodland classes.",
)
parser.add_argument("--batch-size", type=int, default=50_000)
parser.add_argument("--layer", action="append", dest="layers")
parser.add_argument("--max-features-per-layer", type=int)
parser.add_argument("--min-zoom", type=int, default=15)
parser.add_argument("--max-zoom", type=int, default=17)
parser.add_argument("--force-extract", action="store_true")
parser.add_argument("--use-vsizip", action="store_true")
args = parser.parse_args()
build_tree_overlay_tiles(
tow_zip=args.tow_zip,
output_path=args.output,
extract_dir=args.extract_dir,
tow_types=tuple(args.tow_types or DEFAULT_TOW_TYPES),
batch_size=args.batch_size,
layer_names=tuple(args.layers) if args.layers else None,
max_features_per_layer=args.max_features_per_layer,
min_zoom=args.min_zoom,
max_zoom=args.max_zoom,
force_extract=args.force_extract,
use_vsizip=args.use_vsizip,
)
if __name__ == "__main__":
main()