This commit is contained in:
Andras Schmelczer 2026-05-12 22:00:56 +01:00
parent 8708bf000d
commit 11711c57e6
38 changed files with 5361 additions and 265 deletions

View file

@ -1,6 +1,8 @@
import polars as pl
from pipeline.transform.merge import (
_AREA_COLUMNS,
_STATIC_POI_DISTANCE_RENAMES,
_is_dynamic_poi_metric_column,
_less_deprived_percentile_expr,
)
@ -27,7 +29,20 @@ def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Check which column names `_is_dynamic_poi_metric_column` accepts and rejects."""
    # NOTE(review): this listing comes from a rendered diff hunk, so the three
    # "Cafe POI" assertions may be the removed side of the change and the
    # "amenity (Cafe)" assertions the added side — confirm against the file.
    assert _is_dynamic_poi_metric_column("Distance to nearest Cafe POI (km)")
    assert _is_dynamic_poi_metric_column("Number of Cafe POIs within 2km")
    assert _is_dynamic_poi_metric_column("Number of Cafe POIs within 5km")
    assert _is_dynamic_poi_metric_column("Distance to nearest amenity (Cafe) (km)")
    assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 2km")
    assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 5km")
    # A column name without the dynamic-POI wording must not be classified as one.
    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
def test_static_poi_distance_columns_are_renamed_to_configured_area_features() -> None:
    """Pin the static POI distance renames and check every target is an area column."""
    feature_by_source = {
        "parks_nearest_km": "Distance to nearest park (km)",
        "grocery_store_nearest_km": "Distance to nearest grocery store (km)",
        "cafe_nearest_km": "Distance to nearest cafe (km)",
        "pub_nearest_km": "Distance to nearest pub (km)",
        "restaurant_nearest_km": "Distance to nearest restaurant (km)",
    }

    assert _STATIC_POI_DISTANCE_RENAMES == feature_by_source

    # Every renamed feature has to be listed among the area-level columns.
    not_area_level = set(feature_by_source.values()) - set(_AREA_COLUMNS)
    assert not not_area_level

View file

@ -1,6 +1,9 @@
import polars as pl
from pipeline.transform.poi_proximity import _build_poi_category_groups
from pipeline.transform.poi_proximity import (
_build_poi_category_groups,
_dynamic_poi_metric_renames,
)
def test_dynamic_poi_groups_include_requested_categories_only() -> None:
@ -11,6 +14,7 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
+ ["Leisure"] * 2
+ ["Groceries"] * 101
+ ["Groceries"] * 100
+ ["Leisure"] * 10
+ ["Education"] * 200
+ ["Health"] * 200
),
@ -19,11 +23,12 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
+ ["Café", "Restaurant"]
+ ["Tesco"] * 101
+ ["Waitrose"] * 100
+ ["Park"] * 10
+ ["School"] * 200
+ ["Pharmacy"] * 200
),
"lat": [51.5] * 605,
"lng": [-0.1] * 605,
"lat": [51.5] * 615,
"lng": [-0.1] * 615,
}
)
@ -37,5 +42,14 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
"Tesco",
}
assert "poi_waitrose" not in groups
assert "poi_park" not in groups
assert "poi_school" not in groups
assert "poi_pharmacy" not in groups
def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
    """A "parks" group labelled "Park" yields nearest-distance, 2km and 5km renames."""
    renames = _dynamic_poi_metric_renames({"parks": "Park"})

    wanted = {
        "parks_nearest_km": "Distance to nearest amenity (Park) (km)",
        "parks_2km": "Number of amenities (Park) within 2km",
        "parks_5km": "Number of amenities (Park) within 5km",
    }
    assert renames == wanted

View file

@ -0,0 +1,99 @@
from pathlib import Path
import polars as pl
import pytest
from pipeline.transform.tree_density import (
STREET_TREE_COVERAGE_COL,
STREET_TREE_DENSITY_COL,
_coverage_percentile_expr,
_metric_columns,
_postcode_density_percentile_col,
_with_postcode_density_percentiles,
_write_street_rollups,
)
def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None:
    """Higher coverage gets a higher percentile; a null input stays null."""
    frame = pl.DataFrame({"coverage": [0.0, 5.0, 10.0, None]})
    percentile_expr = _coverage_percentile_expr("coverage", "percentile")

    ranked = frame.lazy().with_columns(percentile_expr).collect()

    assert ranked.get_column("percentile").to_list() == [0.0, 50.0, 100.0, None]
def test_coverage_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied minimum values all map to 0 and tied maximum values all map to 100."""
    frame = pl.DataFrame({"coverage": [0.0, 0.0, 5.0, 10.0, 10.0]})
    percentile_expr = _coverage_percentile_expr("coverage", "percentile")

    ranked = frame.lazy().with_columns(percentile_expr).collect()

    assert ranked.get_column("percentile").to_list() == [0.0, 0.0, 50.0, 100.0, 100.0]
def test_street_rollup_percentiles_are_ranked_over_raw_street_coverage(
    tmp_path: Path,
) -> None:
    """Street percentiles are ranked from address-weighted street coverage.

    Oak Road spans two postcodes (two addresses at 10% and one at 30%), so its
    weighted coverage is 16.7%; Elm Street has a single postcode at 50%.
    """
    radius_m = 50
    density_col, area_col, count_col, height_col = _metric_columns(radius_m)
    percentile_col = _postcode_density_percentile_col(radius_m)

    raw_postcode_metrics = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AB", "AA1 1AC"],
            density_col: [10.0, 30.0, 50.0],
            area_col: [100.0, 300.0, 500.0],
            count_col: [1, 3, 5],
            height_col: [4.0, 6.0, 8.0],
        }
    )
    postcode_metrics = _with_postcode_density_percentiles(
        raw_postcode_metrics, radius_m
    )

    sales = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AA", "AA1 1AB", "AA1 1AC"],
            "paon": ["1", "2", "3", "4"],
            "saon": [""] * 4,
            "street": ["Oak Road"] * 3 + ["Elm Street"],
            "locality": [""] * 4,
            "town_city": ["Test Town"] * 4,
            "district": ["Test District"] * 4,
            "county": ["Test County"] * 4,
            "date_of_transfer": [f"2024-01-0{day}" for day in range(1, 5)],
        }
    )

    price_paid_path = tmp_path / "price-paid.parquet"
    output_streets = tmp_path / "streets.parquet"
    output_addresses = tmp_path / "addresses.parquet"
    sales.write_parquet(price_paid_path)

    _write_street_rollups(
        postcode_metrics=postcode_metrics,
        price_paid_path=price_paid_path,
        output_streets=output_streets,
        output_addresses=output_addresses,
        radius_m=radius_m,
    )

    streets = pl.read_parquet(output_streets).sort("street")
    addresses = pl.read_parquet(output_addresses)

    assert streets["street"].to_list() == ["Elm Street", "Oak Road"]
    assert streets[STREET_TREE_COVERAGE_COL].to_list() == pytest.approx([50.0, 16.7])
    assert streets.select("street", STREET_TREE_DENSITY_COL).rows() == [
        ("Elm Street", 100.0),
        ("Oak Road", 0.0),
    ]

    # The address output carries both the postcode-level and street-level columns.
    for expected_column in (
        percentile_col,
        STREET_TREE_COVERAGE_COL,
        STREET_TREE_DENSITY_COL,
    ):
        assert expected_column in addresses.columns

View file

@ -0,0 +1,635 @@
"""Derive street-scale tree density metrics from Forest Research TOW data.
The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
postcode-level metric from the tree polygons, then optionally rolls that up to
Price Paid street names so the dashboard can answer "what is this address's
street like?" without loading the full geodatabase at runtime.
"""
from __future__ import annotations
import argparse
import math
import shutil
import zipfile
from pathlib import Path
import numpy as np
import polars as pl
import pyogrio
import shapely
from scipy.spatial import cKDTree
# Woodland_Type values included by default; joined with commas to form the
# default of the --tow-types command-line option.
DEFAULT_TOW_TYPES = ("Lone Tree", "Group of Trees")
# Name of the geodatabase entry expected at the top level of the TOW zip.
TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
# Street-level output column names written by the street rollup.
STREET_TREE_DENSITY_COL = "Street tree density percentile"
STREET_TREE_COVERAGE_COL = "Street tree coverage (%)"
# Postcode-level column name templates; "{radius}" is filled in with the
# buffer radius via str.format(radius=...).
POSTCODE_DENSITY_COL = "Tree canopy density within {radius}m (%)"
POSTCODE_DENSITY_PERCENTILE_COL = "Tree canopy density percentile within {radius}m"
POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
POSTCODE_COUNT_COL = "Tree features within {radius}m"
POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"
def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path:
    """Extract the TOW zip into ``extract_dir`` and return the .gdb path inside it.

    An existing extraction is reused unless ``force`` is set. Members are
    written to a hidden sibling staging directory first and the staging
    directory is renamed into place only once the geodatabase is found in it.

    Raises:
        ValueError: if a zip member would resolve outside the staging directory.
        FileNotFoundError: if the zip does not contain ``TOW_GDB_NAME`` at its
            top level.
    """
    gdb_path = extract_dir / TOW_GDB_NAME
    if not force and gdb_path.exists():
        print(f"Using existing extracted geodatabase: {gdb_path}")
        return gdb_path

    # Reaching here with an existing directory means either a forced
    # re-extract or a directory that lacks the geodatabase.
    if extract_dir.exists():
        if not force:
            print(f"Removing incomplete extraction directory: {extract_dir}")
        shutil.rmtree(extract_dir)

    staging_dir = extract_dir.with_name(f".{extract_dir.name}.tmp")
    if staging_dir.exists():
        shutil.rmtree(staging_dir)
    staging_dir.mkdir(parents=True)
    staging_root = staging_dir.resolve()

    print(f"Extracting {zip_path} to {extract_dir}...")
    with zipfile.ZipFile(zip_path) as archive:
        for entry in archive.infolist():
            destination = (staging_dir / entry.filename).resolve()
            # Reject members such as "../x" that resolve outside the staging root.
            inside_root = (
                destination == staging_root or staging_root in destination.parents
            )
            if not inside_root:
                raise ValueError(f"Unsafe path in zip archive: {entry.filename}")

            if entry.is_dir():
                destination.mkdir(parents=True, exist_ok=True)
            else:
                destination.parent.mkdir(parents=True, exist_ok=True)
                with archive.open(entry) as source, destination.open("wb") as sink:
                    shutil.copyfileobj(source, sink, length=1024 * 1024)

    if not (staging_dir / TOW_GDB_NAME).exists():
        raise FileNotFoundError(f"{TOW_GDB_NAME} was not found inside {zip_path}")

    staging_dir.rename(extract_dir)
    print(f"Extracted geodatabase: {gdb_path}")
    return gdb_path
def _tow_dataset_path(
    zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
) -> str:
    """Return the dataset path string for the TOW geodatabase.

    With ``use_vsizip`` the zip is not extracted and a ``/vsizip/`` path that
    points into the archive is returned; otherwise the zip is extracted (or an
    existing extraction reused) and the on-disk geodatabase path is returned.
    """
    if not use_vsizip:
        extracted_gdb = _safe_extract_zip(zip_path, extract_dir, force_extract)
        return str(extracted_gdb)
    return f"/vsizip/{zip_path.resolve()}/{TOW_GDB_NAME}"
def _where_for_tow_types(tow_types: tuple[str, ...] | None) -> str | None:
if not tow_types:
return None
escaped = [tow_type.replace("'", "''") for tow_type in tow_types]
values = ", ".join(f"'{tow_type}'" for tow_type in escaped)
return f"Woodland_Type IN ({values})"
def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame:
    """Load one (postcode, x, y) row per postcode from the ArcGIS parquet.

    Keeps rows whose ``ctry25cd`` is "E92000001" and whose ``doterm`` is null,
    drops rows with a missing postcode or coordinate, de-duplicates on
    postcode and sorts by postcode. ``max_postcodes`` optionally truncates the
    sorted result to its first N rows.
    """
    keep_row = (pl.col("ctry25cd") == "E92000001") & pl.col("doterm").is_null()
    query = (
        pl.scan_parquet(arcgis_path)
        .filter(keep_row)
        .select(
            postcode=pl.col("pcds"),
            x=pl.col("east1m").cast(pl.Float64),
            y=pl.col("north1m").cast(pl.Float64),
        )
        .drop_nulls(["postcode", "x", "y"])
        .unique("postcode")
        .sort("postcode")
    )
    if max_postcodes is not None:
        query = query.head(max_postcodes)

    loaded = query.collect()
    print(f"Loaded {loaded.height:,} active English postcode points")
    return loaded
def _layers(dataset_path: str, selected_layers: tuple[str, ...] | None) -> list[str]:
    """Return the layer names to process, in the order the dataset lists them.

    With no selection every available layer is returned.

    Raises:
        ValueError: if a selected layer is not present in the dataset.
    """
    available = [name for name, _geometry_type in pyogrio.list_layers(dataset_path)]
    if selected_layers is None:
        return available

    requested = set(selected_layers)
    unknown = sorted(requested.difference(available))
    if unknown:
        raise ValueError(f"Unknown TOW layer(s): {', '.join(unknown)}")
    return [name for name in available if name in requested]
def _metric_columns(radius_m: int) -> tuple[str, str, str, str]:
    """Return the (density, area, count, height) column names for a radius."""
    density_name, area_name, count_name, height_name = (
        template.format(radius=radius_m)
        for template in (
            POSTCODE_DENSITY_COL,
            POSTCODE_AREA_COL,
            POSTCODE_COUNT_COL,
            POSTCODE_HEIGHT_COL,
        )
    )
    return density_name, area_name, count_name, height_name
def _postcode_density_percentile_col(radius_m: int) -> str:
    """Return the postcode density-percentile column name for a radius."""
    column_name = POSTCODE_DENSITY_PERCENTILE_COL.format(radius=radius_m)
    return column_name
def _coverage_percentile_expr(column: str, alias: str) -> pl.Expr:
    """Rank higher coverage higher on a 0-100 scale over the column's values.

    NaN is treated as null and nulls stay null. Values equal to the column
    minimum map to exactly 0 and values equal to the maximum to exactly 100;
    everything in between is interpolated from its average rank and rounded to
    one decimal place. The result is Float32 and named ``alias``.
    """
    coverage = pl.col(column).fill_nan(None)
    populated = coverage.count()
    average_rank = coverage.rank("average")
    interpolated = ((average_rank - 1) / (populated - 1) * 100).round(1)

    percentile = (
        pl.when(coverage.is_null())
        .then(None)
        .when(coverage == coverage.min())
        .then(0.0)
        .when(coverage == coverage.max())
        .then(100.0)
        .when(populated > 1)
        .then(interpolated)
        # Fallback branch; the min/max branches above already catch a lone value.
        .otherwise(100.0)
    )
    return percentile.cast(pl.Float32).alias(alias)
def _with_postcode_density_percentiles(
    postcode_metrics: pl.DataFrame, radius_m: int
) -> pl.DataFrame:
    """Add the density-percentile column, ranked over the density column."""
    density_col = _metric_columns(radius_m)[0]
    percentile_col = _postcode_density_percentile_col(radius_m)
    percentile_expr = _coverage_percentile_expr(density_col, percentile_col)
    return postcode_metrics.with_columns(percentile_expr)
def _accumulate_tree_metrics(
    dataset_path: str,
    points: pl.DataFrame,
    radius_m: int,
    tow_types: tuple[str, ...] | None,
    batch_size: int,
    layer_names: tuple[str, ...] | None,
    max_features_per_layer: int | None,
    workers: int,
) -> pl.DataFrame:
    """Aggregate TOW features onto postcode points and return per-postcode metrics.

    Each feature is assigned, by the centroid of its geometry, to every
    postcode point within ``radius_m`` of that centroid. The feature's whole
    ``TOW_Area_M`` is added to each matching postcode (the polygon is not
    clipped to the buffer), which is why density is capped at 100%.

    Args:
        dataset_path: Path passed to pyogrio for the TOW dataset.
        points: Frame with "postcode", "x" and "y" columns.
            NOTE(review): assumes x/y use the same coordinate system and units
            as the TOW geometries and ``radius_m`` — confirm against the data.
        radius_m: Search radius around each postcode point.
        tow_types: Woodland_Type values to keep, or None for all.
        batch_size: Arrow batch size used when reading features.
        layer_names: Subset of layers to read, or None for all.
        max_features_per_layer: Optional cap on rows read per layer.
        workers: Worker count passed to ``cKDTree.query_ball_point``.

    Returns:
        One row per input postcode with canopy area, density (%), feature
        count and area-weighted mean height (null where no height was seen).

    Raises:
        ValueError: if a requested layer is unknown, or if a batch carries no
            recognisable geometry column.
    """
    xy = points.select("x", "y").to_numpy()
    tree = cKDTree(xy)
    n_points = points.height

    # Per-postcode accumulators, indexed by row position in ``points``.
    canopy_area = np.zeros(n_points, dtype=np.float64)
    feature_count = np.zeros(n_points, dtype=np.uint32)
    height_weighted_sum = np.zeros(n_points, dtype=np.float64)
    height_weight = np.zeros(n_points, dtype=np.float64)

    where = _where_for_tow_types(tow_types)
    layers = _layers(dataset_path, layer_names)
    print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
    if where:
        print(f"TOW type filter: {where}")

    columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"]
    total_features_seen = 0
    total_features_used = 0

    for layer in layers:
        info = pyogrio.read_info(dataset_path, layer=layer)
        print(f"\nLayer {layer}: {info.get('features', 0):,} features")
        layer_features_seen = 0

        with pyogrio.open_arrow(
            dataset_path,
            layer=layer,
            columns=columns,
            where=where,
            batch_size=batch_size,
            use_pyarrow=True,
        ) as (meta, reader):
            for batch_index, batch in enumerate(reader, start=1):
                if max_features_per_layer is not None:
                    remaining = max_features_per_layer - layer_features_seen
                    if remaining <= 0:
                        break
                    if batch.num_rows > remaining:
                        batch = batch.slice(0, remaining)

                layer_features_seen += batch.num_rows
                total_features_seen += batch.num_rows

                names = batch.schema.names
                # Prefer the geometry column name reported by the reader's
                # metadata; fall back to the names previously assumed so a
                # differently named geometry column no longer breaks the read.
                geometry_name = next(
                    (
                        candidate
                        for candidate in (
                            meta.get("geometry_name"),
                            "SHAPE",
                            "wkb_geometry",
                        )
                        if candidate and candidate in names
                    ),
                    None,
                )
                if geometry_name is None:
                    raise ValueError(
                        f"No geometry column found in TOW layer {layer}; "
                        f"available columns: {', '.join(names)}"
                    )

                area = np.asarray(
                    batch.column(names.index("TOW_Area_M")).to_numpy(zero_copy_only=False),
                    dtype=np.float64,
                )
                height = np.asarray(
                    batch.column(names.index("MEANHT")).to_numpy(zero_copy_only=False),
                    dtype=np.float64,
                )
                geometry = np.asarray(
                    batch.column(names.index(geometry_name)).to_numpy(zero_copy_only=False),
                    dtype=object,
                )

                # Drop features without a usable positive area before parsing WKB.
                valid = np.isfinite(area) & (area > 0)
                if not valid.any():
                    continue

                geometry = geometry[valid]
                area = area[valid]
                height = height[valid]

                centroids = shapely.centroid(shapely.from_wkb(geometry))
                x = shapely.get_x(centroids)
                y = shapely.get_y(centroids)
                valid_xy = np.isfinite(x) & np.isfinite(y)
                if not valid_xy.any():
                    continue

                x = x[valid_xy]
                y = y[valid_xy]
                area = area[valid_xy]
                height = height[valid_xy]

                # One list of postcode row indexes per feature centroid.
                nearby = tree.query_ball_point(
                    np.column_stack((x, y)), radius_m, workers=workers
                )
                lengths = np.fromiter(
                    (len(postcode_indexes) for postcode_indexes in nearby),
                    dtype=np.int32,
                    count=len(nearby),
                )
                matching_features = lengths > 0
                if matching_features.any():
                    # Flatten to parallel (postcode index, feature index) pairs.
                    postcode_indexes = np.concatenate(
                        [indexes for indexes in nearby if indexes]
                    ).astype(np.int64, copy=False)
                    feature_indexes = np.repeat(
                        np.flatnonzero(matching_features), lengths[matching_features]
                    )

                    # np.add.at accumulates correctly when an index repeats.
                    np.add.at(canopy_area, postcode_indexes, area[feature_indexes])
                    np.add.at(feature_count, postcode_indexes, 1)

                    feature_height = height[feature_indexes]
                    valid_height = np.isfinite(feature_height)
                    if valid_height.any():
                        height_area = area[feature_indexes][valid_height]
                        np.add.at(
                            height_weighted_sum,
                            postcode_indexes[valid_height],
                            feature_height[valid_height] * height_area,
                        )
                        np.add.at(
                            height_weight,
                            postcode_indexes[valid_height],
                            height_area,
                        )

                total_features_used += len(area)
                if batch_index == 1 or batch_index % 25 == 0:
                    print(
                        f"  batch {batch_index:,}: "
                        f"{total_features_seen:,} rows read, "
                        f"{total_features_used:,} features with usable centroids"
                    )

    density_col, area_col, count_col, height_col = _metric_columns(radius_m)
    buffer_area = math.pi * radius_m * radius_m
    # Whole-feature areas can exceed the buffer area, so cap density at 100%.
    density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0)
    # Area-weighted mean height; NaN where no feature contributed a height.
    mean_height = np.divide(
        height_weighted_sum,
        height_weight,
        out=np.full(n_points, np.nan, dtype=np.float64),
        where=height_weight > 0,
    )

    return pl.DataFrame(
        {
            "postcode": points["postcode"],
            area_col: canopy_area.round(1).astype(np.float32),
            density_col: density_pct.round(1).astype(np.float32),
            count_col: feature_count.astype(np.uint32),
            height_col: np.round(mean_height, 1).astype(np.float32),
        }
    ).with_columns(
        pl.col(height_col).fill_nan(None),
    )
def _clean_key_expr(column: str) -> pl.Expr:
    """Normalise a text column into an upper-case, single-spaced matching key.

    Nulls become the empty string, every run of characters other than A-Z and
    0-9 collapses to one space, and leading/trailing whitespace is removed.
    """
    upper = pl.col(column).fill_null("").str.to_uppercase()
    alphanumeric_only = upper.str.replace_all(r"[^A-Z0-9]+", " ")
    single_spaced = alphanumeric_only.str.replace_all(r"\s+", " ")
    return single_spaced.str.strip_chars()
def _latest_price_paid_addresses(price_paid_path: Path) -> pl.LazyFrame:
    """Return one lazy row per (postcode, pp_address) from the Price Paid parquet.

    Rows need a postcode and a street whose cleaned key is non-empty.
    ``pp_address`` joins saon, paon and street with single spaces. For each
    address the street and locality fields come from the row with the latest
    ``date_of_transfer``, and ``street_key`` joins the cleaned street, town,
    district and county with "|".
    """
    transactions = pl.scan_parquet(price_paid_path).select(
        pl.col("postcode").str.strip_chars().str.to_uppercase().alias("postcode"),
        "paon",
        "saon",
        "street",
        "locality",
        "town_city",
        "district",
        "county",
        "date_of_transfer",
    )
    with_street = transactions.filter(
        pl.col("postcode").is_not_null(),
        pl.col("street").is_not_null(),
        _clean_key_expr("street") != "",
    )

    pp_address = (
        pl.concat_str(
            [pl.col("saon"), pl.col("paon"), pl.col("street")],
            separator=" ",
            ignore_nulls=True,
        )
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
        .alias("pp_address")
    )

    # Sorting by transfer date first makes ``last()`` pick the newest values.
    latest_per_address = (
        with_street.with_columns(pp_address)
        .filter(pl.col("pp_address").is_not_null())
        .sort("date_of_transfer")
        .group_by("postcode", "pp_address", maintain_order=True)
        .agg(pl.col("street", "locality", "town_city", "district", "county").last())
    )

    street_key = pl.concat_str(
        [
            _clean_key_expr(part)
            for part in ("street", "town_city", "district", "county")
        ],
        separator="|",
    ).alias("street_key")
    return latest_per_address.with_columns(street_key)
def _weighted_mean_expr(column: str, weight: str) -> pl.Expr:
    """Weighted mean of ``column`` by ``weight``, skipping null and NaN values.

    Yields null when the usable weights do not sum to more than zero.
    """
    values = pl.col(column)
    weights = pl.col(weight)
    usable = values.is_not_null() & ~values.is_nan()

    weighted_total = pl.when(usable).then(values * weights).sum()
    weight_total = pl.when(usable).then(weights).sum()
    return (
        pl.when(weight_total > 0)
        .then(weighted_total / weight_total)
        .otherwise(None)
    )
def _write_street_rollups(
    postcode_metrics: pl.DataFrame,
    price_paid_path: Path,
    output_streets: Path | None,
    output_addresses: Path | None,
    radius_m: int,
) -> None:
    """Roll postcode metrics up to streets and write the requested parquet files.

    Price Paid addresses are inner-joined to the postcode metrics, reduced to
    one row per street and postcode with an address count, then averaged per
    ``street_key`` weighted by that count. Street coverage is ranked into the
    street density percentile. Does nothing when neither output path is given.
    """
    wants_output = output_streets is not None or output_addresses is not None
    if not wants_output:
        return

    metric_cols = _metric_columns(radius_m)
    density_col, area_col, count_col, height_col = metric_cols

    addresses = _latest_price_paid_addresses(price_paid_path).join(
        postcode_metrics.lazy(), on="postcode", how="inner"
    )

    # Metrics are constant within a postcode, so ``first()`` just carries them.
    per_postcode = (
        addresses.group_by(
            "street_key",
            "postcode",
            "street",
            "locality",
            "town_city",
            "district",
            "county",
        )
        .agg(
            pl.len().alias("address_count"),
            *(pl.col(metric).first() for metric in metric_cols),
        )
        .collect()
    )

    def street_mean(source: str, alias: str) -> pl.Expr:
        """Address-weighted mean of ``source``, rounded to 1 dp as Float32."""
        return (
            _weighted_mean_expr(source, "address_count")
            .round(1)
            .cast(pl.Float32)
            .alias(alias)
        )

    streets = (
        per_postcode.lazy()
        .group_by("street_key")
        .agg(
            pl.col("street", "locality", "town_city", "district", "county").first(),
            pl.col("postcode").n_unique().alias("postcode_count"),
            pl.col("address_count").sum().alias("address_count"),
            street_mean(density_col, STREET_TREE_COVERAGE_COL),
            *(
                street_mean(metric, f"Street average {metric}")
                for metric in (area_col, count_col, height_col)
            ),
        )
        .with_columns(
            _coverage_percentile_expr(
                STREET_TREE_COVERAGE_COL,
                STREET_TREE_DENSITY_COL,
            )
        )
        .sort("street_key")
        .collect()
    )

    if output_addresses is not None:
        output_addresses.parent.mkdir(parents=True, exist_ok=True)
        street_columns = streets.lazy().select(
            "street_key",
            STREET_TREE_COVERAGE_COL,
            STREET_TREE_DENSITY_COL,
        )
        addresses.join(street_columns, on="street_key", how="left").sink_parquet(
            output_addresses, compression="zstd"
        )
        print(f"Wrote address tree-density join: {output_addresses}")

    if output_streets is not None:
        output_streets.parent.mkdir(parents=True, exist_ok=True)
        streets.write_parquet(output_streets, compression="zstd")
        print(f"Wrote street tree-density rollup: {output_streets}")
def _parse_csv_arg(value: str | None) -> tuple[str, ...] | None:
if value is None:
return None
if value.lower() == "all":
return None
parts = tuple(part.strip() for part in value.split(",") if part.strip())
return parts or None
def main() -> None:
    """Command-line entry point: build postcode metrics and optional street rollups.

    Parses the options, validates them, resolves the TOW dataset path, loads
    the postcode points, accumulates the tree metrics, writes the postcode
    parquet and, when a Price Paid file is given, writes the street and
    address outputs.
    """
    cli = argparse.ArgumentParser(
        description="Build postcode and street tree-density metrics from FR_TOW_V1_ALL.zip"
    )
    # (flag, add_argument keyword options), registered in this order.
    option_specs = (
        (
            "--tow-zip",
            {
                "type": Path,
                "default": Path("property-data/FR_TOW_V1_ALL.zip"),
                "help": "Forest Research TOW zip containing FR_TOW_V1_ALL.gdb",
            },
        ),
        (
            "--extract-dir",
            {
                "type": Path,
                "default": Path("property-data/fr_tow_v1_all"),
                "help": "Directory where the zip is extracted",
            },
        ),
        (
            "--force-extract",
            {
                "action": "store_true",
                "help": "Re-extract the TOW zip even if the geodatabase already exists",
            },
        ),
        (
            "--use-vsizip",
            {
                "action": "store_true",
                "help": "Read the geodatabase directly from the zip instead of extracting it",
            },
        ),
        (
            "--arcgis",
            {
                "type": Path,
                "default": Path("property-data/arcgis_data.parquet"),
                "help": "Postcode centroid parquet with east1m/north1m columns",
            },
        ),
        (
            "--price-paid",
            {
                "type": Path,
                "default": None,
                "help": "Optional Price Paid parquet used to roll postcode metrics up to streets",
            },
        ),
        (
            "--output-postcodes",
            {
                "type": Path,
                "required": True,
                "help": "Output postcode-level tree-density parquet",
            },
        ),
        (
            "--output-streets",
            {
                "type": Path,
                "default": None,
                "help": "Optional output street-level tree-density parquet",
            },
        ),
        (
            "--output-addresses",
            {
                "type": Path,
                "default": None,
                "help": "Optional output address/street join parquet keyed by postcode and pp_address",
            },
        ),
        (
            "--radius-m",
            {
                "type": int,
                "default": 50,
                "help": "Radius around each postcode centroid used as the street-scale buffer",
            },
        ),
        (
            "--tow-types",
            {
                "default": ",".join(DEFAULT_TOW_TYPES),
                "help": 'Comma-separated Woodland_Type values to include, or "all"',
            },
        ),
        (
            "--layers",
            {
                "default": None,
                "help": "Optional comma-separated subset of TOW layers for testing",
            },
        ),
        (
            "--batch-size",
            {
                "type": int,
                "default": 65_536,
                "help": "Arrow batch size for reading TOW features",
            },
        ),
        (
            "--workers",
            {
                "type": int,
                "default": -1,
                "help": "Worker count passed to scipy cKDTree.query_ball_point",
            },
        ),
        (
            "--max-postcodes",
            {
                "type": int,
                "default": None,
                "help": "Testing only: process the first N postcode points",
            },
        ),
        (
            "--max-features-per-layer",
            {
                "type": int,
                "default": None,
                "help": "Testing only: process at most N TOW features per layer",
            },
        ),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    # Street and address outputs are derived from Price Paid rows.
    wants_rollup_output = bool(opts.output_streets or opts.output_addresses)
    if wants_rollup_output and opts.price_paid is None:
        raise SystemExit("--price-paid is required when writing street/address outputs")
    if opts.radius_m <= 0:
        raise SystemExit("--radius-m must be greater than zero")

    dataset_path = _tow_dataset_path(
        opts.tow_zip, opts.extract_dir, opts.force_extract, opts.use_vsizip
    )
    points = _postcode_points(opts.arcgis, opts.max_postcodes)

    raw_metrics = _accumulate_tree_metrics(
        dataset_path=dataset_path,
        points=points,
        radius_m=opts.radius_m,
        tow_types=_parse_csv_arg(opts.tow_types),
        batch_size=opts.batch_size,
        layer_names=_parse_csv_arg(opts.layers),
        max_features_per_layer=opts.max_features_per_layer,
        workers=opts.workers,
    )
    postcode_metrics = _with_postcode_density_percentiles(raw_metrics, opts.radius_m)

    opts.output_postcodes.parent.mkdir(parents=True, exist_ok=True)
    postcode_metrics.write_parquet(opts.output_postcodes, compression="zstd")
    print(f"\nWrote postcode tree-density metrics: {opts.output_postcodes}")

    if opts.price_paid is None:
        return
    _write_street_rollups(
        postcode_metrics=postcode_metrics,
        price_paid_path=opts.price_paid,
        output_streets=opts.output_streets,
        output_addresses=opts.output_addresses,
        radius_m=opts.radius_m,
    )
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()