improve
This commit is contained in:
parent
8688b7475e
commit
e8345cbdc1
40 changed files with 1980 additions and 904 deletions
|
|
@ -1,16 +1,28 @@
|
|||
"""Derive street-scale tree density metrics from Forest Research TOW + NFI data.
|
||||
"""Derive postcode-scale tree density metrics from Forest Research TOW + NFI data.
|
||||
|
||||
The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
|
||||
inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
|
||||
postcode-level metric from the tree polygons, then optionally rolls that up to
|
||||
Price Paid street names so the dashboard can answer "what is this address's
|
||||
street like?" without loading the full geodatabase at runtime.
|
||||
postcode-level metric from the tree polygons so the dashboard can answer "how
|
||||
green is this postcode?" without loading the full geodatabase at runtime.
|
||||
|
||||
TOW only covers trees *outside* woodland, so the National Forest Inventory (NFI)
|
||||
woodland layer is optionally unioned in. TOW canopy is accumulated by centroid
|
||||
proximity (tiny crowns), while large NFI woodland parcels are accumulated by
|
||||
true buffer-clipped intersection area so they cannot saturate a postcode from
|
||||
mere centroid proximity.
|
||||
Every postcode centroid is expanded into a radius-r buffer ("extended area").
|
||||
Both TOW tree crowns and National Forest Inventory (NFI) woodland parcels are
|
||||
accumulated by *true buffer-clipped intersection area*: only the part of each
|
||||
polygon that falls inside a postcode's buffer is counted, never the area that
|
||||
spills outside it. A crown straddling the buffer edge therefore contributes only
|
||||
its inside portion, and a parcel reaching into the buffer from outside is still
|
||||
counted -- no polygon can saturate a postcode from mere proximity.
|
||||
|
||||
TOW only covers trees *outside* woodland, so the NFI woodland layer is the
|
||||
geometric complement of TOW and is optionally unioned in. The two products are
|
||||
*assumed disjoint*: clipped TOW crown area and clipped NFI woodland area are
|
||||
summed into the same per-postcode accumulator, so any spatial overlap between a
|
||||
TOW crown and an NFI parcel (boundary slop where "groups of trees" meet
|
||||
"woodland") would be double-counted. The final density is capped at 100% and
|
||||
_finalize_metrics logs how many postcodes exceed 100% raw coverage, which is a
|
||||
direct symptom of such overlap (or of overlapping crowns within one buffer); if
|
||||
that count is material the products are not disjoint and the NFI clip should be
|
||||
taken against the complement of TOW.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -25,16 +37,12 @@ import numpy as np
|
|||
import polars as pl
|
||||
import pyogrio
|
||||
import shapely
|
||||
from scipy.spatial import cKDTree
|
||||
|
||||
|
||||
TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
|
||||
STREET_TREE_DENSITY_COL = "Street tree density percentile"
|
||||
STREET_TREE_COVERAGE_COL = "Street tree coverage (%)"
|
||||
POSTCODE_DENSITY_COL = "Tree canopy density within {radius}m (%)"
|
||||
POSTCODE_DENSITY_PERCENTILE_COL = "Tree canopy density percentile within {radius}m"
|
||||
POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
|
||||
POSTCODE_COUNT_COL = "Tree features within {radius}m"
|
||||
POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"
|
||||
|
||||
# National Forest Inventory (NFI) woodland — the geometric complement of TOW.
|
||||
|
|
@ -131,13 +139,24 @@ def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Pat
|
|||
def _nfi_dataset_path(
|
||||
zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
|
||||
) -> str:
|
||||
"""Resolve the NFI woodland shapefile path, extracting the zip if needed."""
|
||||
"""Resolve the NFI woodland shapefile path, extracting the zip if needed.
|
||||
|
||||
Raises if the archive contains zero or more than one shapefile rather than
|
||||
silently picking one, so an ambiguous NFI release fails loudly instead of
|
||||
accumulating canopy from the wrong layer.
|
||||
"""
|
||||
if use_vsizip:
|
||||
return f"/vsizip/{zip_path.resolve()}"
|
||||
extracted = _safe_extract_zip_dir(zip_path, extract_dir, force_extract)
|
||||
shapefiles = sorted(extracted.rglob("*.shp"))
|
||||
if not shapefiles:
|
||||
raise FileNotFoundError(f"No .shp found inside {zip_path}")
|
||||
if len(shapefiles) > 1:
|
||||
names = ", ".join(path.name for path in shapefiles)
|
||||
raise ValueError(
|
||||
f"Expected exactly one shapefile inside {zip_path}, found {len(shapefiles)} "
|
||||
f"({names}); cannot unambiguously pick the NFI woodland layer"
|
||||
)
|
||||
return str(shapefiles[0])
|
||||
|
||||
|
||||
|
|
@ -146,7 +165,7 @@ def _geometry_column(metadata: dict, column_names: list[str]) -> str:
|
|||
geometry_name = metadata.get("geometry_name")
|
||||
if geometry_name:
|
||||
return str(geometry_name)
|
||||
for name in ("wkb_geometry", "geometry", "geom"):
|
||||
for name in ("wkb_geometry", "geometry", "geom", "SHAPE"):
|
||||
if name in column_names:
|
||||
return name
|
||||
return column_names[-1]
|
||||
|
|
@ -184,11 +203,10 @@ def _layers(dataset_path: str, selected_layers: tuple[str, ...] | None) -> list[
|
|||
return [layer for layer in available if layer in selected_layers]
|
||||
|
||||
|
||||
def _metric_columns(radius_m: int) -> tuple[str, str, str]:
    """Return the per-postcode metric column names for a given buffer radius.

    Each name is produced by substituting ``radius_m`` into the matching
    module-level ``POSTCODE_*_COL`` template.

    Returns:
        ``(density_col, area_col, height_col)`` in that order; callers unpack
        the tuple positionally.
    """
    return (
        POSTCODE_DENSITY_COL.format(radius=radius_m),
        POSTCODE_AREA_COL.format(radius=radius_m),
        POSTCODE_HEIGHT_COL.format(radius=radius_m),
    )
|
||||
|
||||
|
|
@ -198,20 +216,23 @@ def _postcode_density_percentile_col(radius_m: int) -> str:
|
|||
|
||||
|
||||
def _coverage_percentile_expr(column: str, alias: str) -> pl.Expr:
    """Rank tree coverage on a 0-100 England-wide percentile scale.

    A single tie-consistent average-rank formula is used for every value so the
    scale is internally consistent end to end: tied values share their mean rank,
    so the lowest coverage maps toward 0 and the highest toward 100 only when they
    are not themselves tied. An all-equal (or single-value) column has no spread
    and maps to the neutral midpoint (50).

    Args:
        column: Name of the coverage column to rank.
        alias: Name given to the resulting percentile column.

    Returns:
        A Float32 expression: null where the input is null or NaN, otherwise
        ``(average_rank - 1) / (non_null_count - 1) * 100`` rounded to one
        decimal place, or 50.0 when there is at most one non-null value.
    """
    # NaN is folded into null so it is excluded from both the rank and the count.
    value = pl.col(column).fill_nan(None)
    non_null_count = value.count()
    rank = value.rank("average")
    return (
        pl.when(value.is_null())
        .then(None)
        # The guard keeps the denominator (count - 1) away from zero.
        .when(non_null_count > 1)
        .then(((rank - 1) / (non_null_count - 1) * 100).round(1))
        .otherwise(50.0)
        .cast(pl.Float32)
        .alias(alias)
    )
|
||||
|
|
@ -220,7 +241,7 @@ def _coverage_percentile_expr(column: str, alias: str) -> pl.Expr:
|
|||
def _with_postcode_density_percentiles(
|
||||
postcode_metrics: pl.DataFrame, radius_m: int
|
||||
) -> pl.DataFrame:
|
||||
density_col, _area_col, _count_col, _height_col = _metric_columns(radius_m)
|
||||
density_col, _area_col, _height_col = _metric_columns(radius_m)
|
||||
return postcode_metrics.with_columns(
|
||||
_coverage_percentile_expr(
|
||||
density_col,
|
||||
|
|
@ -229,28 +250,88 @@ def _with_postcode_density_percentiles(
|
|||
)
|
||||
|
||||
|
||||
def _accumulate_tree_metrics(
|
||||
def _postcode_buffers(
    points: pl.DataFrame, radius_m: int
) -> tuple[np.ndarray, shapely.STRtree]:
    """Build a radius-r circle for every postcode plus an STRtree over them.

    Circle index == postcode index, so an STRtree match resolves directly to the
    postcode accumulator slot.

    Args:
        points: Frame with ``x`` and ``y`` columns, one row per postcode.
        radius_m: Buffer radius, in the same units as ``x``/``y``.

    Returns:
        ``(circles, tree)`` where ``circles[i]`` is the buffer around row ``i``
        of ``points`` and ``tree`` is an STRtree built over ``circles``.
    """
    xy = points.select("x", "y").to_numpy()
    # quad_segs=8 approximates each circle with 8 segments per quarter turn,
    # so the polygon's area is slightly below pi * r**2.
    circles = shapely.buffer(shapely.points(xy), radius_m, quad_segs=8)
    return circles, shapely.STRtree(circles)
|
||||
|
||||
|
||||
def _accumulate_clipped_area(
    geoms: np.ndarray,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    height: np.ndarray | None = None,
    height_weighted_sum: np.ndarray | None = None,
    height_weight: np.ndarray | None = None,
) -> None:
    """Add each polygon's in-buffer overlap area to every postcode it intersects.

    Only area(polygon ∩ circle) is accumulated -- never the area of the polygon
    that falls outside the postcode's extended buffer -- so a crown straddling
    the buffer edge contributes only its inside portion and a large parcel cannot
    saturate a postcode from mere proximity. When ``height`` is supplied the mean
    feature height is accumulated weighted by that same clipped overlap area.

    Args:
        geoms: Array of shapely geometries; missing and empty entries are dropped.
        circles: Postcode buffers, indexed by postcode.
        tree: STRtree whose result indices are used to index ``circles`` and
            the accumulators (i.e. built over ``circles``).
        canopy_area: Per-postcode accumulator, updated in place.
        height: Optional per-feature heights aligned with ``geoms``.
        height_weighted_sum: Per-postcode accumulator of ``height * clipped
            area``, updated in place. Required when ``height`` is given.
        height_weight: Per-postcode accumulator of the clipped area that had a
            finite height, updated in place. Required when ``height`` is given.

    Raises:
        ValueError: ``height`` is given without both height accumulators.
    """
    # Fail fast with a clear message instead of a TypeError deep in np.add.at.
    if height is not None and (height_weighted_sum is None or height_weight is None):
        raise ValueError(
            "height_weighted_sum and height_weight are required when height is given"
        )

    keep = ~shapely.is_missing(geoms) & ~shapely.is_empty(geoms)
    geoms = geoms[keep]
    if height is not None:
        # Keep heights aligned with the filtered geometries.
        height = height[keep]
    if geoms.size == 0:
        return

    # query(predicate="intersects") over the circle STRtree returns exactly the
    # (polygon, circle) pairs whose clipped overlap can be positive -- i.e. the
    # polygon overlaps that postcode's radius-r buffer.
    geom_index, postcode_index = tree.query(geoms, predicate="intersects")
    if geom_index.size == 0:
        return

    clipped_area = shapely.area(
        shapely.intersection(geoms[geom_index], circles[postcode_index])
    )
    # Pairs that merely touch have zero overlap area and contribute nothing.
    positive = clipped_area > 0
    geom_index = geom_index[positive]
    postcode_index = postcode_index[positive]
    clipped_area = clipped_area[positive]

    # np.add.at accumulates correctly when a postcode index repeats.
    np.add.at(canopy_area, postcode_index, clipped_area)

    if height is not None:
        feature_height = height[geom_index]
        # Features without a finite height are left out of the weighted mean.
        finite = np.isfinite(feature_height)
        if finite.any():
            np.add.at(
                height_weighted_sum,
                postcode_index[finite],
                feature_height[finite] * clipped_area[finite],
            )
            np.add.at(height_weight, postcode_index[finite], clipped_area[finite])
|
||||
|
||||
|
||||
def _accumulate_tow_metrics(
|
||||
dataset_path: str,
|
||||
points: pl.DataFrame,
|
||||
radius_m: int,
|
||||
circles: np.ndarray,
|
||||
tree: shapely.STRtree,
|
||||
canopy_area: np.ndarray,
|
||||
height_weighted_sum: np.ndarray,
|
||||
height_weight: np.ndarray,
|
||||
batch_size: int,
|
||||
layer_names: tuple[str, ...] | None,
|
||||
max_features_per_layer: int | None,
|
||||
workers: int,
|
||||
canopy_area: np.ndarray,
|
||||
feature_count: np.ndarray,
|
||||
height_weighted_sum: np.ndarray,
|
||||
height_weight: np.ndarray,
|
||||
) -> None:
|
||||
xy = points.select("x", "y").to_numpy()
|
||||
tree = cKDTree(xy)
|
||||
|
||||
layers = _layers(dataset_path, layer_names)
|
||||
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
|
||||
|
||||
columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"]
|
||||
columns = ["MEANHT"]
|
||||
total_features_seen = 0
|
||||
total_features_used = 0
|
||||
|
||||
for layer in layers:
|
||||
info = pyogrio.read_info(dataset_path, layer=layer)
|
||||
|
|
@ -263,7 +344,7 @@ def _accumulate_tree_metrics(
|
|||
columns=columns,
|
||||
batch_size=batch_size,
|
||||
use_pyarrow=True,
|
||||
) as (_meta, reader):
|
||||
) as (meta, reader):
|
||||
for batch_index, batch in enumerate(reader, start=1):
|
||||
if max_features_per_layer is not None:
|
||||
remaining = max_features_per_layer - layer_features_seen
|
||||
|
|
@ -275,135 +356,29 @@ def _accumulate_tree_metrics(
|
|||
layer_features_seen += batch.num_rows
|
||||
total_features_seen += batch.num_rows
|
||||
names = batch.schema.names
|
||||
area = np.asarray(
|
||||
batch.column(names.index("TOW_Area_M")).to_numpy(zero_copy_only=False),
|
||||
dtype=np.float64,
|
||||
)
|
||||
geometry_column = _geometry_column(meta, names)
|
||||
height = np.asarray(
|
||||
batch.column(names.index("MEANHT")).to_numpy(zero_copy_only=False),
|
||||
dtype=np.float64,
|
||||
)
|
||||
geometry = np.asarray(
|
||||
batch.column(names.index("SHAPE")).to_numpy(zero_copy_only=False),
|
||||
batch.column(names.index(geometry_column)).to_numpy(
|
||||
zero_copy_only=False
|
||||
),
|
||||
dtype=object,
|
||||
)
|
||||
|
||||
valid = np.isfinite(area) & (area > 0)
|
||||
if not valid.any():
|
||||
continue
|
||||
|
||||
geometry = geometry[valid]
|
||||
area = area[valid]
|
||||
height = height[valid]
|
||||
|
||||
centroids = shapely.centroid(shapely.from_wkb(geometry))
|
||||
x = shapely.get_x(centroids)
|
||||
y = shapely.get_y(centroids)
|
||||
valid_xy = np.isfinite(x) & np.isfinite(y)
|
||||
if not valid_xy.any():
|
||||
continue
|
||||
|
||||
x = x[valid_xy]
|
||||
y = y[valid_xy]
|
||||
area = area[valid_xy]
|
||||
height = height[valid_xy]
|
||||
|
||||
nearby = tree.query_ball_point(
|
||||
np.column_stack((x, y)), radius_m, workers=workers
|
||||
_accumulate_clipped_area(
|
||||
shapely.from_wkb(geometry),
|
||||
circles,
|
||||
tree,
|
||||
canopy_area,
|
||||
height=height,
|
||||
height_weighted_sum=height_weighted_sum,
|
||||
height_weight=height_weight,
|
||||
)
|
||||
lengths = np.fromiter(
|
||||
(len(postcode_indexes) for postcode_indexes in nearby),
|
||||
dtype=np.int32,
|
||||
count=len(nearby),
|
||||
)
|
||||
matching_features = lengths > 0
|
||||
if matching_features.any():
|
||||
postcode_indexes = np.concatenate(
|
||||
[indexes for indexes in nearby if indexes]
|
||||
).astype(np.int64, copy=False)
|
||||
feature_indexes = np.repeat(
|
||||
np.flatnonzero(matching_features), lengths[matching_features]
|
||||
)
|
||||
|
||||
np.add.at(canopy_area, postcode_indexes, area[feature_indexes])
|
||||
np.add.at(feature_count, postcode_indexes, 1)
|
||||
|
||||
feature_height = height[feature_indexes]
|
||||
valid_height = np.isfinite(feature_height)
|
||||
if valid_height.any():
|
||||
height_area = area[feature_indexes][valid_height]
|
||||
np.add.at(
|
||||
height_weighted_sum,
|
||||
postcode_indexes[valid_height],
|
||||
feature_height[valid_height] * height_area,
|
||||
)
|
||||
np.add.at(
|
||||
height_weight,
|
||||
postcode_indexes[valid_height],
|
||||
height_area,
|
||||
)
|
||||
|
||||
total_features_used += len(area)
|
||||
if batch_index == 1 or batch_index % 25 == 0:
|
||||
print(
|
||||
f" batch {batch_index:,}: "
|
||||
f"{total_features_seen:,} rows read, "
|
||||
f"{total_features_used:,} features with usable centroids"
|
||||
)
|
||||
|
||||
|
||||
def _postcode_buffers(
|
||||
points: pl.DataFrame, radius_m: int
|
||||
) -> tuple[np.ndarray, shapely.STRtree]:
|
||||
"""Build a radius-r circle for every postcode plus an STRtree over them.
|
||||
|
||||
Circle index == postcode index, matching the order used by the cKDTree path.
|
||||
"""
|
||||
xy = points.select("x", "y").to_numpy()
|
||||
circles = shapely.buffer(shapely.points(xy), radius_m, quad_segs=8)
|
||||
return circles, shapely.STRtree(circles)
|
||||
|
||||
|
||||
def _add_nfi_batch(
|
||||
geoms: np.ndarray,
|
||||
category: np.ndarray,
|
||||
circles: np.ndarray,
|
||||
tree: shapely.STRtree,
|
||||
canopy_area: np.ndarray,
|
||||
feature_count: np.ndarray,
|
||||
radius_m: int,
|
||||
) -> None:
|
||||
"""Add NFI woodland into the shared arrays by true buffer-clipped area.
|
||||
|
||||
Unlike the TOW centroid path, this clips each woodland polygon to each
|
||||
nearby postcode circle and adds only area(polygon ∩ circle); a large parcel
|
||||
therefore cannot saturate a postcode from mere centroid proximity, and a
|
||||
buffer-filling parcel whose centroid is outside the radius is not missed.
|
||||
"""
|
||||
keep = (category == NFI_WOODLAND_VALUE) & ~shapely.is_missing(geoms)
|
||||
geoms = geoms[keep]
|
||||
if geoms.size:
|
||||
geoms = geoms[~shapely.is_empty(geoms)]
|
||||
if geoms.size == 0:
|
||||
return
|
||||
|
||||
# dwithin(polygon, point, r) is true iff the radius-r circle around the
|
||||
# point intersects the polygon -- exactly the candidate set we want.
|
||||
nfi_index, postcode_index = tree.query(
|
||||
geoms, predicate="dwithin", distance=radius_m
|
||||
)
|
||||
if nfi_index.size == 0:
|
||||
return
|
||||
|
||||
clipped_area = shapely.area(
|
||||
shapely.intersection(geoms[nfi_index], circles[postcode_index])
|
||||
)
|
||||
positive = clipped_area > 0
|
||||
postcode_index = postcode_index[positive]
|
||||
clipped_area = clipped_area[positive]
|
||||
|
||||
np.add.at(canopy_area, postcode_index, clipped_area)
|
||||
np.add.at(feature_count, postcode_index, 1)
|
||||
print(f" batch {batch_index:,}: {total_features_seen:,} rows read")
|
||||
|
||||
|
||||
def _accumulate_nfi_metrics(
|
||||
|
|
@ -411,8 +386,6 @@ def _accumulate_nfi_metrics(
|
|||
circles: np.ndarray,
|
||||
tree: shapely.STRtree,
|
||||
canopy_area: np.ndarray,
|
||||
feature_count: np.ndarray,
|
||||
radius_m: int,
|
||||
batch_size: int,
|
||||
max_nfi_features: int | None,
|
||||
) -> None:
|
||||
|
|
@ -455,14 +428,12 @@ def _accumulate_nfi_metrics(
|
|||
),
|
||||
dtype=object,
|
||||
)
|
||||
_add_nfi_batch(
|
||||
shapely.from_wkb(geometry),
|
||||
category,
|
||||
geoms = shapely.from_wkb(geometry)
|
||||
_accumulate_clipped_area(
|
||||
geoms[category == NFI_WOODLAND_VALUE],
|
||||
circles,
|
||||
tree,
|
||||
canopy_area,
|
||||
feature_count,
|
||||
radius_m,
|
||||
)
|
||||
if batch_index == 1 or batch_index % 25 == 0:
|
||||
print(f" NFI batch {batch_index:,}: {features_seen:,} rows read")
|
||||
|
|
@ -471,15 +442,26 @@ def _accumulate_nfi_metrics(
|
|||
def _finalize_metrics(
|
||||
points: pl.DataFrame,
|
||||
canopy_area: np.ndarray,
|
||||
feature_count: np.ndarray,
|
||||
height_weighted_sum: np.ndarray,
|
||||
height_weight: np.ndarray,
|
||||
radius_m: int,
|
||||
) -> pl.DataFrame:
|
||||
n_points = points.height
|
||||
density_col, area_col, count_col, height_col = _metric_columns(radius_m)
|
||||
density_col, area_col, height_col = _metric_columns(radius_m)
|
||||
buffer_area = math.pi * radius_m * radius_m
|
||||
density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0)
|
||||
raw_density = canopy_area / buffer_area * 100.0
|
||||
density_pct = np.minimum(raw_density, 100.0)
|
||||
|
||||
# Symptom of the assumed-disjoint TOW/NFI union being violated (or of
|
||||
# overlapping crowns inside one buffer): clipped areas alone cannot exceed the
|
||||
# buffer unless polygons overlap. Surface it rather than hide it behind the cap.
|
||||
over_count = int(np.count_nonzero(raw_density > 100.0))
|
||||
if over_count:
|
||||
print(
|
||||
f" note: {over_count:,} postcode(s) exceeded 100% raw canopy and were "
|
||||
"capped — indicates overlapping TOW/NFI canopy within the buffer"
|
||||
)
|
||||
|
||||
mean_height = np.divide(
|
||||
height_weighted_sum,
|
||||
height_weight,
|
||||
|
|
@ -492,7 +474,6 @@ def _finalize_metrics(
|
|||
"postcode": points["postcode"],
|
||||
area_col: canopy_area.round(1).astype(np.float32),
|
||||
density_col: density_pct.round(1).astype(np.float32),
|
||||
count_col: feature_count.astype(np.uint32),
|
||||
height_col: np.round(mean_height, 1).astype(np.float32),
|
||||
}
|
||||
).with_columns(
|
||||
|
|
@ -500,181 +481,9 @@ def _finalize_metrics(
|
|||
)
|
||||
|
||||
|
||||
def _clean_key_expr(column: str) -> pl.Expr:
|
||||
return (
|
||||
pl.col(column)
|
||||
.fill_null("")
|
||||
.str.to_uppercase()
|
||||
.str.replace_all(r"[^A-Z0-9]+", " ")
|
||||
.str.replace_all(r"\s+", " ")
|
||||
.str.strip_chars()
|
||||
)
|
||||
|
||||
|
||||
def _latest_price_paid_addresses(price_paid_path: Path) -> pl.LazyFrame:
|
||||
return (
|
||||
pl.scan_parquet(price_paid_path)
|
||||
.select(
|
||||
pl.col("postcode").str.strip_chars().str.to_uppercase().alias("postcode"),
|
||||
"paon",
|
||||
"saon",
|
||||
"street",
|
||||
"locality",
|
||||
"town_city",
|
||||
"district",
|
||||
"county",
|
||||
"date_of_transfer",
|
||||
)
|
||||
.filter(pl.col("postcode").is_not_null())
|
||||
.filter(pl.col("street").is_not_null())
|
||||
.filter(_clean_key_expr("street") != "")
|
||||
.with_columns(
|
||||
pl.concat_str(
|
||||
[pl.col("saon"), pl.col("paon"), pl.col("street")],
|
||||
separator=" ",
|
||||
ignore_nulls=True,
|
||||
)
|
||||
.str.replace_all(r"\s+", " ")
|
||||
.str.strip_chars()
|
||||
.alias("pp_address"),
|
||||
)
|
||||
.filter(pl.col("pp_address").is_not_null())
|
||||
.sort("date_of_transfer")
|
||||
.group_by("postcode", "pp_address", maintain_order=True)
|
||||
.agg(
|
||||
pl.col("street").last(),
|
||||
pl.col("locality").last(),
|
||||
pl.col("town_city").last(),
|
||||
pl.col("district").last(),
|
||||
pl.col("county").last(),
|
||||
)
|
||||
.with_columns(
|
||||
pl.concat_str(
|
||||
[
|
||||
_clean_key_expr("street"),
|
||||
_clean_key_expr("town_city"),
|
||||
_clean_key_expr("district"),
|
||||
_clean_key_expr("county"),
|
||||
],
|
||||
separator="|",
|
||||
).alias("street_key")
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def _weighted_mean_expr(column: str, weight: str) -> pl.Expr:
|
||||
valid = pl.col(column).is_not_null() & ~pl.col(column).is_nan()
|
||||
numerator = pl.when(valid).then(pl.col(column) * pl.col(weight)).sum()
|
||||
denominator = pl.when(valid).then(pl.col(weight)).sum()
|
||||
return pl.when(denominator > 0).then(numerator / denominator).otherwise(None)
|
||||
|
||||
|
||||
def _write_street_rollups(
|
||||
postcode_metrics: pl.DataFrame,
|
||||
price_paid_path: Path,
|
||||
output_streets: Path | None,
|
||||
output_addresses: Path | None,
|
||||
radius_m: int,
|
||||
) -> None:
|
||||
if output_streets is None and output_addresses is None:
|
||||
return
|
||||
|
||||
density_col, area_col, count_col, height_col = _metric_columns(radius_m)
|
||||
metrics = postcode_metrics.lazy()
|
||||
addresses = _latest_price_paid_addresses(price_paid_path).join(
|
||||
metrics, on="postcode", how="inner"
|
||||
)
|
||||
|
||||
per_postcode = (
|
||||
addresses.group_by(
|
||||
"street_key",
|
||||
"postcode",
|
||||
"street",
|
||||
"locality",
|
||||
"town_city",
|
||||
"district",
|
||||
"county",
|
||||
)
|
||||
.agg(
|
||||
pl.len().alias("address_count"),
|
||||
pl.col(density_col).first(),
|
||||
pl.col(area_col).first(),
|
||||
pl.col(count_col).first(),
|
||||
pl.col(height_col).first(),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
|
||||
streets = (
|
||||
per_postcode.lazy()
|
||||
.group_by("street_key")
|
||||
.agg(
|
||||
pl.col("street").first(),
|
||||
pl.col("locality").first(),
|
||||
pl.col("town_city").first(),
|
||||
pl.col("district").first(),
|
||||
pl.col("county").first(),
|
||||
pl.col("postcode").n_unique().alias("postcode_count"),
|
||||
pl.col("address_count").sum().alias("address_count"),
|
||||
_weighted_mean_expr(density_col, "address_count")
|
||||
.round(1)
|
||||
.cast(pl.Float32)
|
||||
.alias(STREET_TREE_COVERAGE_COL),
|
||||
_weighted_mean_expr(area_col, "address_count")
|
||||
.round(1)
|
||||
.cast(pl.Float32)
|
||||
.alias(f"Street average {area_col}"),
|
||||
_weighted_mean_expr(count_col, "address_count")
|
||||
.round(1)
|
||||
.cast(pl.Float32)
|
||||
.alias(f"Street average {count_col}"),
|
||||
_weighted_mean_expr(height_col, "address_count")
|
||||
.round(1)
|
||||
.cast(pl.Float32)
|
||||
.alias(f"Street average {height_col}"),
|
||||
)
|
||||
.with_columns(
|
||||
_coverage_percentile_expr(
|
||||
STREET_TREE_COVERAGE_COL,
|
||||
STREET_TREE_DENSITY_COL,
|
||||
)
|
||||
)
|
||||
.sort("street_key")
|
||||
.collect()
|
||||
)
|
||||
|
||||
if output_addresses is not None:
|
||||
output_addresses.parent.mkdir(parents=True, exist_ok=True)
|
||||
address_output = addresses.join(
|
||||
streets.lazy().select(
|
||||
"street_key",
|
||||
STREET_TREE_COVERAGE_COL,
|
||||
STREET_TREE_DENSITY_COL,
|
||||
),
|
||||
on="street_key",
|
||||
how="left",
|
||||
)
|
||||
address_output.sink_parquet(output_addresses, compression="zstd")
|
||||
print(f"Wrote address tree-density join: {output_addresses}")
|
||||
|
||||
if output_streets is not None:
|
||||
output_streets.parent.mkdir(parents=True, exist_ok=True)
|
||||
streets.write_parquet(output_streets, compression="zstd")
|
||||
print(f"Wrote street tree-density rollup: {output_streets}")
|
||||
|
||||
|
||||
def _parse_csv_arg(value: str | None) -> tuple[str, ...] | None:
|
||||
if value is None:
|
||||
return None
|
||||
if value.lower() == "all":
|
||||
return None
|
||||
parts = tuple(part.strip() for part in value.split(",") if part.strip())
|
||||
return parts or None
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Build postcode and street tree-density metrics from FR_TOW_V1_ALL.zip"
|
||||
description="Build postcode-level tree-density metrics from FR_TOW_V1_ALL.zip"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tow-zip",
|
||||
|
|
@ -716,35 +525,17 @@ def main() -> None:
|
|||
default=Path("property-data/arcgis_data.parquet"),
|
||||
help="Postcode centroid parquet with east1m/north1m columns",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--price-paid",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Optional Price Paid parquet used to roll postcode metrics up to streets",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-postcodes",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Output postcode-level tree-density parquet",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-streets",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Optional output street-level tree-density parquet",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-addresses",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Optional output address/street join parquet keyed by postcode and pp_address",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--radius-m",
|
||||
type=int,
|
||||
default=50,
|
||||
help="Radius around each postcode centroid used as the street-scale buffer",
|
||||
help="Radius around each postcode centroid used as the extended buffer",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--layers",
|
||||
|
|
@ -757,12 +548,6 @@ def main() -> None:
|
|||
default=65_536,
|
||||
help="Arrow batch size for reading TOW features",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--workers",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="Worker count passed to scipy cKDTree.query_ball_point",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-postcodes",
|
||||
type=int,
|
||||
|
|
@ -783,9 +568,6 @@ def main() -> None:
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if (args.output_streets or args.output_addresses) and args.price_paid is None:
|
||||
raise SystemExit("--price-paid is required when writing street/address outputs")
|
||||
|
||||
if args.radius_m <= 0:
|
||||
raise SystemExit("--radius-m must be greater than zero")
|
||||
|
||||
|
|
@ -797,36 +579,32 @@ def main() -> None:
|
|||
|
||||
n_points = points.height
|
||||
canopy_area = np.zeros(n_points, dtype=np.float64)
|
||||
feature_count = np.zeros(n_points, dtype=np.uint32)
|
||||
height_weighted_sum = np.zeros(n_points, dtype=np.float64)
|
||||
height_weight = np.zeros(n_points, dtype=np.float64)
|
||||
|
||||
_accumulate_tree_metrics(
|
||||
circles, tree = _postcode_buffers(points, args.radius_m)
|
||||
|
||||
_accumulate_tow_metrics(
|
||||
dataset_path=dataset_path,
|
||||
points=points,
|
||||
radius_m=args.radius_m,
|
||||
circles=circles,
|
||||
tree=tree,
|
||||
canopy_area=canopy_area,
|
||||
height_weighted_sum=height_weighted_sum,
|
||||
height_weight=height_weight,
|
||||
batch_size=args.batch_size,
|
||||
layer_names=layer_names,
|
||||
max_features_per_layer=args.max_features_per_layer,
|
||||
workers=args.workers,
|
||||
canopy_area=canopy_area,
|
||||
feature_count=feature_count,
|
||||
height_weighted_sum=height_weighted_sum,
|
||||
height_weight=height_weight,
|
||||
)
|
||||
|
||||
if args.nfi_zip is not None and args.nfi_zip.exists():
|
||||
nfi_path = _nfi_dataset_path(
|
||||
args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip
|
||||
)
|
||||
circles, nfi_tree = _postcode_buffers(points, args.radius_m)
|
||||
_accumulate_nfi_metrics(
|
||||
dataset_path=nfi_path,
|
||||
circles=circles,
|
||||
tree=nfi_tree,
|
||||
tree=tree,
|
||||
canopy_area=canopy_area,
|
||||
feature_count=feature_count,
|
||||
radius_m=args.radius_m,
|
||||
batch_size=args.batch_size,
|
||||
max_nfi_features=args.max_nfi_features,
|
||||
)
|
||||
|
|
@ -836,7 +614,6 @@ def main() -> None:
|
|||
postcode_metrics = _finalize_metrics(
|
||||
points,
|
||||
canopy_area,
|
||||
feature_count,
|
||||
height_weighted_sum,
|
||||
height_weight,
|
||||
args.radius_m,
|
||||
|
|
@ -849,14 +626,14 @@ def main() -> None:
|
|||
postcode_metrics.write_parquet(args.output_postcodes, compression="zstd")
|
||||
print(f"\nWrote postcode tree-density metrics: {args.output_postcodes}")
|
||||
|
||||
if args.price_paid is not None:
|
||||
_write_street_rollups(
|
||||
postcode_metrics=postcode_metrics,
|
||||
price_paid_path=args.price_paid,
|
||||
output_streets=args.output_streets,
|
||||
output_addresses=args.output_addresses,
|
||||
radius_m=args.radius_m,
|
||||
)
|
||||
|
||||
def _parse_csv_arg(value: str | None) -> tuple[str, ...] | None:
|
||||
if value is None:
|
||||
return None
|
||||
if value.lower() == "all":
|
||||
return None
|
||||
parts = tuple(part.strip() for part in value.split(",") if part.strip())
|
||||
return parts or None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue