lgtm
This commit is contained in:
parent
8708bf000d
commit
11711c57e6
38 changed files with 5361 additions and 265 deletions
|
|
@ -1,6 +1,8 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.merge import (
|
||||
_AREA_COLUMNS,
|
||||
_STATIC_POI_DISTANCE_RENAMES,
|
||||
_is_dynamic_poi_metric_column,
|
||||
_less_deprived_percentile_expr,
|
||||
)
|
||||
|
|
@ -27,7 +29,20 @@ def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
|
|||
|
||||
|
||||
def test_dynamic_poi_metric_columns_are_area_level() -> None:
|
||||
assert _is_dynamic_poi_metric_column("Distance to nearest Cafe POI (km)")
|
||||
assert _is_dynamic_poi_metric_column("Number of Cafe POIs within 2km")
|
||||
assert _is_dynamic_poi_metric_column("Number of Cafe POIs within 5km")
|
||||
assert _is_dynamic_poi_metric_column("Distance to nearest amenity (Cafe) (km)")
|
||||
assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 2km")
|
||||
assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 5km")
|
||||
assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
|
||||
|
||||
|
||||
def test_static_poi_distance_columns_are_renamed_to_configured_area_features() -> None:
|
||||
expected = {
|
||||
"parks_nearest_km": "Distance to nearest park (km)",
|
||||
"grocery_store_nearest_km": "Distance to nearest grocery store (km)",
|
||||
"cafe_nearest_km": "Distance to nearest cafe (km)",
|
||||
"pub_nearest_km": "Distance to nearest pub (km)",
|
||||
"restaurant_nearest_km": "Distance to nearest restaurant (km)",
|
||||
}
|
||||
|
||||
assert _STATIC_POI_DISTANCE_RENAMES == expected
|
||||
assert set(expected.values()).issubset(_AREA_COLUMNS)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.poi_proximity import _build_poi_category_groups
|
||||
from pipeline.transform.poi_proximity import (
|
||||
_build_poi_category_groups,
|
||||
_dynamic_poi_metric_renames,
|
||||
)
|
||||
|
||||
|
||||
def test_dynamic_poi_groups_include_requested_categories_only() -> None:
|
||||
|
|
@ -11,6 +14,7 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
|
|||
+ ["Leisure"] * 2
|
||||
+ ["Groceries"] * 101
|
||||
+ ["Groceries"] * 100
|
||||
+ ["Leisure"] * 10
|
||||
+ ["Education"] * 200
|
||||
+ ["Health"] * 200
|
||||
),
|
||||
|
|
@ -19,11 +23,12 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
|
|||
+ ["Café", "Restaurant"]
|
||||
+ ["Tesco"] * 101
|
||||
+ ["Waitrose"] * 100
|
||||
+ ["Park"] * 10
|
||||
+ ["School"] * 200
|
||||
+ ["Pharmacy"] * 200
|
||||
),
|
||||
"lat": [51.5] * 605,
|
||||
"lng": [-0.1] * 605,
|
||||
"lat": [51.5] * 615,
|
||||
"lng": [-0.1] * 615,
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -37,5 +42,14 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
|
|||
"Tesco",
|
||||
}
|
||||
assert "poi_waitrose" not in groups
|
||||
assert "poi_park" not in groups
|
||||
assert "poi_school" not in groups
|
||||
assert "poi_pharmacy" not in groups
|
||||
|
||||
|
||||
def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
|
||||
assert _dynamic_poi_metric_renames({"parks": "Park"}) == {
|
||||
"parks_nearest_km": "Distance to nearest amenity (Park) (km)",
|
||||
"parks_2km": "Number of amenities (Park) within 2km",
|
||||
"parks_5km": "Number of amenities (Park) within 5km",
|
||||
}
|
||||
|
|
|
|||
99
pipeline/transform/test_tree_density.py
Normal file
99
pipeline/transform/test_tree_density.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
import pytest
|
||||
|
||||
from pipeline.transform.tree_density import (
|
||||
STREET_TREE_COVERAGE_COL,
|
||||
STREET_TREE_DENSITY_COL,
|
||||
_coverage_percentile_expr,
|
||||
_metric_columns,
|
||||
_postcode_density_percentile_col,
|
||||
_with_postcode_density_percentiles,
|
||||
_write_street_rollups,
|
||||
)
|
||||
|
||||
|
||||
def test_coverage_percentile_expr_ranks_higher_coverage_higher() -> None:
|
||||
df = pl.DataFrame({"coverage": [0.0, 5.0, 10.0, None]})
|
||||
|
||||
result = df.lazy().with_columns(
|
||||
_coverage_percentile_expr("coverage", "percentile")
|
||||
).collect()
|
||||
|
||||
assert result["percentile"].to_list() == [0.0, 50.0, 100.0, None]
|
||||
|
||||
|
||||
def test_coverage_percentile_expr_uses_exact_scale_endpoints() -> None:
|
||||
df = pl.DataFrame({"coverage": [0.0, 0.0, 5.0, 10.0, 10.0]})
|
||||
|
||||
result = df.lazy().with_columns(
|
||||
_coverage_percentile_expr("coverage", "percentile")
|
||||
).collect()
|
||||
|
||||
assert result["percentile"].to_list() == [0.0, 0.0, 50.0, 100.0, 100.0]
|
||||
|
||||
|
||||
def test_street_rollup_percentiles_are_ranked_over_raw_street_coverage(
|
||||
tmp_path: Path,
|
||||
) -> None:
|
||||
radius_m = 50
|
||||
density_col, area_col, count_col, height_col = _metric_columns(radius_m)
|
||||
percentile_col = _postcode_density_percentile_col(radius_m)
|
||||
|
||||
postcode_metrics = _with_postcode_density_percentiles(
|
||||
pl.DataFrame(
|
||||
{
|
||||
"postcode": ["AA1 1AA", "AA1 1AB", "AA1 1AC"],
|
||||
density_col: [10.0, 30.0, 50.0],
|
||||
area_col: [100.0, 300.0, 500.0],
|
||||
count_col: [1, 3, 5],
|
||||
height_col: [4.0, 6.0, 8.0],
|
||||
}
|
||||
),
|
||||
radius_m,
|
||||
)
|
||||
|
||||
price_paid = pl.DataFrame(
|
||||
{
|
||||
"postcode": ["AA1 1AA", "AA1 1AA", "AA1 1AB", "AA1 1AC"],
|
||||
"paon": ["1", "2", "3", "4"],
|
||||
"saon": ["", "", "", ""],
|
||||
"street": ["Oak Road", "Oak Road", "Oak Road", "Elm Street"],
|
||||
"locality": ["", "", "", ""],
|
||||
"town_city": ["Test Town", "Test Town", "Test Town", "Test Town"],
|
||||
"district": ["Test District"] * 4,
|
||||
"county": ["Test County"] * 4,
|
||||
"date_of_transfer": [
|
||||
"2024-01-01",
|
||||
"2024-01-02",
|
||||
"2024-01-03",
|
||||
"2024-01-04",
|
||||
],
|
||||
}
|
||||
)
|
||||
price_paid_path = tmp_path / "price-paid.parquet"
|
||||
output_streets = tmp_path / "streets.parquet"
|
||||
output_addresses = tmp_path / "addresses.parquet"
|
||||
price_paid.write_parquet(price_paid_path)
|
||||
|
||||
_write_street_rollups(
|
||||
postcode_metrics=postcode_metrics,
|
||||
price_paid_path=price_paid_path,
|
||||
output_streets=output_streets,
|
||||
output_addresses=output_addresses,
|
||||
radius_m=radius_m,
|
||||
)
|
||||
|
||||
streets = pl.read_parquet(output_streets).sort("street")
|
||||
addresses = pl.read_parquet(output_addresses)
|
||||
|
||||
assert streets["street"].to_list() == ["Elm Street", "Oak Road"]
|
||||
assert streets[STREET_TREE_COVERAGE_COL].to_list() == pytest.approx([50.0, 16.7])
|
||||
assert streets.select("street", STREET_TREE_DENSITY_COL).rows() == [
|
||||
("Elm Street", 100.0),
|
||||
("Oak Road", 0.0),
|
||||
]
|
||||
assert percentile_col in addresses.columns
|
||||
assert STREET_TREE_COVERAGE_COL in addresses.columns
|
||||
assert STREET_TREE_DENSITY_COL in addresses.columns
|
||||
635
pipeline/transform/tree_density.py
Normal file
635
pipeline/transform/tree_density.py
Normal file
|
|
@ -0,0 +1,635 @@
|
|||
"""Derive street-scale tree density metrics from Forest Research TOW data.
|
||||
|
||||
The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
|
||||
inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
|
||||
postcode-level metric from the tree polygons, then optionally rolls that up to
|
||||
Price Paid street names so the dashboard can answer "what is this address's
|
||||
street like?" without loading the full geodatabase at runtime.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import math
|
||||
import shutil
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import pyogrio
|
||||
import shapely
|
||||
from scipy.spatial import cKDTree
|
||||
|
||||
|
||||
DEFAULT_TOW_TYPES = ("Lone Tree", "Group of Trees")
|
||||
TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
|
||||
STREET_TREE_DENSITY_COL = "Street tree density percentile"
|
||||
STREET_TREE_COVERAGE_COL = "Street tree coverage (%)"
|
||||
POSTCODE_DENSITY_COL = "Tree canopy density within {radius}m (%)"
|
||||
POSTCODE_DENSITY_PERCENTILE_COL = "Tree canopy density percentile within {radius}m"
|
||||
POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
|
||||
POSTCODE_COUNT_COL = "Tree features within {radius}m"
|
||||
POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"
|
||||
|
||||
|
||||
def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path:
|
||||
"""Extract the TOW zip and return the extracted .gdb path."""
|
||||
gdb_path = extract_dir / TOW_GDB_NAME
|
||||
if gdb_path.exists() and not force:
|
||||
print(f"Using existing extracted geodatabase: {gdb_path}")
|
||||
return gdb_path
|
||||
|
||||
if force and extract_dir.exists():
|
||||
shutil.rmtree(extract_dir)
|
||||
elif extract_dir.exists():
|
||||
print(f"Removing incomplete extraction directory: {extract_dir}")
|
||||
shutil.rmtree(extract_dir)
|
||||
|
||||
tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp")
|
||||
if tmp_dir.exists():
|
||||
shutil.rmtree(tmp_dir)
|
||||
tmp_dir.mkdir(parents=True)
|
||||
|
||||
root = tmp_dir.resolve()
|
||||
print(f"Extracting {zip_path} to {extract_dir}...")
|
||||
with zipfile.ZipFile(zip_path) as archive:
|
||||
for member in archive.infolist():
|
||||
target = (tmp_dir / member.filename).resolve()
|
||||
if root != target and root not in target.parents:
|
||||
raise ValueError(f"Unsafe path in zip archive: {member.filename}")
|
||||
|
||||
if member.is_dir():
|
||||
target.mkdir(parents=True, exist_ok=True)
|
||||
continue
|
||||
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
with archive.open(member) as source, target.open("wb") as dest:
|
||||
shutil.copyfileobj(source, dest, length=1024 * 1024)
|
||||
|
||||
if not (tmp_dir / TOW_GDB_NAME).exists():
|
||||
raise FileNotFoundError(f"{TOW_GDB_NAME} was not found inside {zip_path}")
|
||||
|
||||
tmp_dir.rename(extract_dir)
|
||||
print(f"Extracted geodatabase: {gdb_path}")
|
||||
return gdb_path
|
||||
|
||||
|
||||
def _tow_dataset_path(
|
||||
zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
|
||||
) -> str:
|
||||
if use_vsizip:
|
||||
return f"/vsizip/{zip_path.resolve()}/{TOW_GDB_NAME}"
|
||||
return str(_safe_extract_zip(zip_path, extract_dir, force_extract))
|
||||
|
||||
|
||||
def _where_for_tow_types(tow_types: tuple[str, ...] | None) -> str | None:
|
||||
if not tow_types:
|
||||
return None
|
||||
escaped = [tow_type.replace("'", "''") for tow_type in tow_types]
|
||||
values = ", ".join(f"'{tow_type}'" for tow_type in escaped)
|
||||
return f"Woodland_Type IN ({values})"
|
||||
|
||||
|
||||
def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame:
|
||||
points = (
|
||||
pl.scan_parquet(arcgis_path)
|
||||
.filter(pl.col("ctry25cd") == "E92000001")
|
||||
.filter(pl.col("doterm").is_null())
|
||||
.select(
|
||||
pl.col("pcds").alias("postcode"),
|
||||
pl.col("east1m").cast(pl.Float64).alias("x"),
|
||||
pl.col("north1m").cast(pl.Float64).alias("y"),
|
||||
)
|
||||
.drop_nulls(["postcode", "x", "y"])
|
||||
.unique("postcode")
|
||||
.sort("postcode")
|
||||
)
|
||||
if max_postcodes is not None:
|
||||
points = points.head(max_postcodes)
|
||||
df = points.collect()
|
||||
print(f"Loaded {df.height:,} active English postcode points")
|
||||
return df
|
||||
|
||||
|
||||
def _layers(dataset_path: str, selected_layers: tuple[str, ...] | None) -> list[str]:
|
||||
available = [layer for layer, _geometry_type in pyogrio.list_layers(dataset_path)]
|
||||
if selected_layers is None:
|
||||
return available
|
||||
|
||||
missing = sorted(set(selected_layers) - set(available))
|
||||
if missing:
|
||||
raise ValueError(f"Unknown TOW layer(s): {', '.join(missing)}")
|
||||
return [layer for layer in available if layer in selected_layers]
|
||||
|
||||
|
||||
def _metric_columns(radius_m: int) -> tuple[str, str, str, str]:
|
||||
return (
|
||||
POSTCODE_DENSITY_COL.format(radius=radius_m),
|
||||
POSTCODE_AREA_COL.format(radius=radius_m),
|
||||
POSTCODE_COUNT_COL.format(radius=radius_m),
|
||||
POSTCODE_HEIGHT_COL.format(radius=radius_m),
|
||||
)
|
||||
|
||||
|
||||
def _postcode_density_percentile_col(radius_m: int) -> str:
|
||||
return POSTCODE_DENSITY_PERCENTILE_COL.format(radius=radius_m)
|
||||
|
||||
|
||||
def _coverage_percentile_expr(column: str, alias: str) -> pl.Expr:
|
||||
"""Rank higher tree coverage higher on a 0-100 England-wide percentile scale."""
|
||||
value = pl.col(column).fill_nan(None)
|
||||
non_null_count = value.count()
|
||||
rank = value.rank("average")
|
||||
return (
|
||||
pl.when(value.is_null())
|
||||
.then(None)
|
||||
.when(value == value.min())
|
||||
.then(0.0)
|
||||
.when(value == value.max())
|
||||
.then(100.0)
|
||||
.when(non_null_count > 1)
|
||||
.then(((rank - 1) / (non_null_count - 1) * 100).round(1))
|
||||
.otherwise(100.0)
|
||||
.cast(pl.Float32)
|
||||
.alias(alias)
|
||||
)
|
||||
|
||||
|
||||
def _with_postcode_density_percentiles(
|
||||
postcode_metrics: pl.DataFrame, radius_m: int
|
||||
) -> pl.DataFrame:
|
||||
density_col, _area_col, _count_col, _height_col = _metric_columns(radius_m)
|
||||
return postcode_metrics.with_columns(
|
||||
_coverage_percentile_expr(
|
||||
density_col,
|
||||
_postcode_density_percentile_col(radius_m),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def _accumulate_tree_metrics(
|
||||
dataset_path: str,
|
||||
points: pl.DataFrame,
|
||||
radius_m: int,
|
||||
tow_types: tuple[str, ...] | None,
|
||||
batch_size: int,
|
||||
layer_names: tuple[str, ...] | None,
|
||||
max_features_per_layer: int | None,
|
||||
workers: int,
|
||||
) -> pl.DataFrame:
|
||||
xy = points.select("x", "y").to_numpy()
|
||||
tree = cKDTree(xy)
|
||||
n_points = points.height
|
||||
|
||||
canopy_area = np.zeros(n_points, dtype=np.float64)
|
||||
feature_count = np.zeros(n_points, dtype=np.uint32)
|
||||
height_weighted_sum = np.zeros(n_points, dtype=np.float64)
|
||||
height_weight = np.zeros(n_points, dtype=np.float64)
|
||||
|
||||
where = _where_for_tow_types(tow_types)
|
||||
layers = _layers(dataset_path, layer_names)
|
||||
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
|
||||
if where:
|
||||
print(f"TOW type filter: {where}")
|
||||
|
||||
columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"]
|
||||
total_features_seen = 0
|
||||
total_features_used = 0
|
||||
|
||||
for layer in layers:
|
||||
info = pyogrio.read_info(dataset_path, layer=layer)
|
||||
print(f"\nLayer {layer}: {info.get('features', 0):,} features")
|
||||
layer_features_seen = 0
|
||||
|
||||
with pyogrio.open_arrow(
|
||||
dataset_path,
|
||||
layer=layer,
|
||||
columns=columns,
|
||||
where=where,
|
||||
batch_size=batch_size,
|
||||
use_pyarrow=True,
|
||||
) as (_meta, reader):
|
||||
for batch_index, batch in enumerate(reader, start=1):
|
||||
if max_features_per_layer is not None:
|
||||
remaining = max_features_per_layer - layer_features_seen
|
||||
if remaining <= 0:
|
||||
break
|
||||
if batch.num_rows > remaining:
|
||||
batch = batch.slice(0, remaining)
|
||||
|
||||
layer_features_seen += batch.num_rows
|
||||
total_features_seen += batch.num_rows
|
||||
names = batch.schema.names
|
||||
area = np.asarray(
|
||||
batch.column(names.index("TOW_Area_M")).to_numpy(zero_copy_only=False),
|
||||
dtype=np.float64,
|
||||
)
|
||||
height = np.asarray(
|
||||
batch.column(names.index("MEANHT")).to_numpy(zero_copy_only=False),
|
||||
dtype=np.float64,
|
||||
)
|
||||
geometry = np.asarray(
|
||||
batch.column(names.index("SHAPE")).to_numpy(zero_copy_only=False),
|
||||
dtype=object,
|
||||
)
|
||||
|
||||
valid = np.isfinite(area) & (area > 0)
|
||||
if not valid.any():
|
||||
continue
|
||||
|
||||
geometry = geometry[valid]
|
||||
area = area[valid]
|
||||
height = height[valid]
|
||||
|
||||
centroids = shapely.centroid(shapely.from_wkb(geometry))
|
||||
x = shapely.get_x(centroids)
|
||||
y = shapely.get_y(centroids)
|
||||
valid_xy = np.isfinite(x) & np.isfinite(y)
|
||||
if not valid_xy.any():
|
||||
continue
|
||||
|
||||
x = x[valid_xy]
|
||||
y = y[valid_xy]
|
||||
area = area[valid_xy]
|
||||
height = height[valid_xy]
|
||||
|
||||
nearby = tree.query_ball_point(
|
||||
np.column_stack((x, y)), radius_m, workers=workers
|
||||
)
|
||||
lengths = np.fromiter(
|
||||
(len(postcode_indexes) for postcode_indexes in nearby),
|
||||
dtype=np.int32,
|
||||
count=len(nearby),
|
||||
)
|
||||
matching_features = lengths > 0
|
||||
if matching_features.any():
|
||||
postcode_indexes = np.concatenate(
|
||||
[indexes for indexes in nearby if indexes]
|
||||
).astype(np.int64, copy=False)
|
||||
feature_indexes = np.repeat(
|
||||
np.flatnonzero(matching_features), lengths[matching_features]
|
||||
)
|
||||
|
||||
np.add.at(canopy_area, postcode_indexes, area[feature_indexes])
|
||||
np.add.at(feature_count, postcode_indexes, 1)
|
||||
|
||||
feature_height = height[feature_indexes]
|
||||
valid_height = np.isfinite(feature_height)
|
||||
if valid_height.any():
|
||||
height_area = area[feature_indexes][valid_height]
|
||||
np.add.at(
|
||||
height_weighted_sum,
|
||||
postcode_indexes[valid_height],
|
||||
feature_height[valid_height] * height_area,
|
||||
)
|
||||
np.add.at(
|
||||
height_weight,
|
||||
postcode_indexes[valid_height],
|
||||
height_area,
|
||||
)
|
||||
|
||||
total_features_used += len(area)
|
||||
if batch_index == 1 or batch_index % 25 == 0:
|
||||
print(
|
||||
f" batch {batch_index:,}: "
|
||||
f"{total_features_seen:,} rows read, "
|
||||
f"{total_features_used:,} features with usable centroids"
|
||||
)
|
||||
|
||||
density_col, area_col, count_col, height_col = _metric_columns(radius_m)
|
||||
buffer_area = math.pi * radius_m * radius_m
|
||||
density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0)
|
||||
mean_height = np.divide(
|
||||
height_weighted_sum,
|
||||
height_weight,
|
||||
out=np.full(n_points, np.nan, dtype=np.float64),
|
||||
where=height_weight > 0,
|
||||
)
|
||||
|
||||
return pl.DataFrame(
|
||||
{
|
||||
"postcode": points["postcode"],
|
||||
area_col: canopy_area.round(1).astype(np.float32),
|
||||
density_col: density_pct.round(1).astype(np.float32),
|
||||
count_col: feature_count.astype(np.uint32),
|
||||
height_col: np.round(mean_height, 1).astype(np.float32),
|
||||
}
|
||||
).with_columns(
|
||||
pl.col(height_col).fill_nan(None),
|
||||
)
|
||||
|
||||
|
||||
def _clean_key_expr(column: str) -> pl.Expr:
|
||||
return (
|
||||
pl.col(column)
|
||||
.fill_null("")
|
||||
.str.to_uppercase()
|
||||
.str.replace_all(r"[^A-Z0-9]+", " ")
|
||||
.str.replace_all(r"\s+", " ")
|
||||
.str.strip_chars()
|
||||
)
|
||||
|
||||
|
||||
def _latest_price_paid_addresses(price_paid_path: Path) -> pl.LazyFrame:
|
||||
return (
|
||||
pl.scan_parquet(price_paid_path)
|
||||
.select(
|
||||
pl.col("postcode").str.strip_chars().str.to_uppercase().alias("postcode"),
|
||||
"paon",
|
||||
"saon",
|
||||
"street",
|
||||
"locality",
|
||||
"town_city",
|
||||
"district",
|
||||
"county",
|
||||
"date_of_transfer",
|
||||
)
|
||||
.filter(pl.col("postcode").is_not_null())
|
||||
.filter(pl.col("street").is_not_null())
|
||||
.filter(_clean_key_expr("street") != "")
|
||||
.with_columns(
|
||||
pl.concat_str(
|
||||
[pl.col("saon"), pl.col("paon"), pl.col("street")],
|
||||
separator=" ",
|
||||
ignore_nulls=True,
|
||||
)
|
||||
.str.replace_all(r"\s+", " ")
|
||||
.str.strip_chars()
|
||||
.alias("pp_address"),
|
||||
)
|
||||
.filter(pl.col("pp_address").is_not_null())
|
||||
.sort("date_of_transfer")
|
||||
.group_by("postcode", "pp_address", maintain_order=True)
|
||||
.agg(
|
||||
pl.col("street").last(),
|
||||
pl.col("locality").last(),
|
||||
pl.col("town_city").last(),
|
||||
pl.col("district").last(),
|
||||
pl.col("county").last(),
|
||||
)
|
||||
.with_columns(
|
||||
pl.concat_str(
|
||||
[
|
||||
_clean_key_expr("street"),
|
||||
_clean_key_expr("town_city"),
|
||||
_clean_key_expr("district"),
|
||||
_clean_key_expr("county"),
|
||||
],
|
||||
separator="|",
|
||||
).alias("street_key")
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def _weighted_mean_expr(column: str, weight: str) -> pl.Expr:
|
||||
valid = pl.col(column).is_not_null() & ~pl.col(column).is_nan()
|
||||
numerator = pl.when(valid).then(pl.col(column) * pl.col(weight)).sum()
|
||||
denominator = pl.when(valid).then(pl.col(weight)).sum()
|
||||
return pl.when(denominator > 0).then(numerator / denominator).otherwise(None)
|
||||
|
||||
|
||||
def _write_street_rollups(
|
||||
postcode_metrics: pl.DataFrame,
|
||||
price_paid_path: Path,
|
||||
output_streets: Path | None,
|
||||
output_addresses: Path | None,
|
||||
radius_m: int,
|
||||
) -> None:
|
||||
if output_streets is None and output_addresses is None:
|
||||
return
|
||||
|
||||
density_col, area_col, count_col, height_col = _metric_columns(radius_m)
|
||||
metrics = postcode_metrics.lazy()
|
||||
addresses = _latest_price_paid_addresses(price_paid_path).join(
|
||||
metrics, on="postcode", how="inner"
|
||||
)
|
||||
|
||||
per_postcode = (
|
||||
addresses.group_by(
|
||||
"street_key",
|
||||
"postcode",
|
||||
"street",
|
||||
"locality",
|
||||
"town_city",
|
||||
"district",
|
||||
"county",
|
||||
)
|
||||
.agg(
|
||||
pl.len().alias("address_count"),
|
||||
pl.col(density_col).first(),
|
||||
pl.col(area_col).first(),
|
||||
pl.col(count_col).first(),
|
||||
pl.col(height_col).first(),
|
||||
)
|
||||
.collect()
|
||||
)
|
||||
|
||||
streets = (
|
||||
per_postcode.lazy()
|
||||
.group_by("street_key")
|
||||
.agg(
|
||||
pl.col("street").first(),
|
||||
pl.col("locality").first(),
|
||||
pl.col("town_city").first(),
|
||||
pl.col("district").first(),
|
||||
pl.col("county").first(),
|
||||
pl.col("postcode").n_unique().alias("postcode_count"),
|
||||
pl.col("address_count").sum().alias("address_count"),
|
||||
_weighted_mean_expr(density_col, "address_count")
|
||||
.round(1)
|
||||
.cast(pl.Float32)
|
||||
.alias(STREET_TREE_COVERAGE_COL),
|
||||
_weighted_mean_expr(area_col, "address_count")
|
||||
.round(1)
|
||||
.cast(pl.Float32)
|
||||
.alias(f"Street average {area_col}"),
|
||||
_weighted_mean_expr(count_col, "address_count")
|
||||
.round(1)
|
||||
.cast(pl.Float32)
|
||||
.alias(f"Street average {count_col}"),
|
||||
_weighted_mean_expr(height_col, "address_count")
|
||||
.round(1)
|
||||
.cast(pl.Float32)
|
||||
.alias(f"Street average {height_col}"),
|
||||
)
|
||||
.with_columns(
|
||||
_coverage_percentile_expr(
|
||||
STREET_TREE_COVERAGE_COL,
|
||||
STREET_TREE_DENSITY_COL,
|
||||
)
|
||||
)
|
||||
.sort("street_key")
|
||||
.collect()
|
||||
)
|
||||
|
||||
if output_addresses is not None:
|
||||
output_addresses.parent.mkdir(parents=True, exist_ok=True)
|
||||
address_output = addresses.join(
|
||||
streets.lazy().select(
|
||||
"street_key",
|
||||
STREET_TREE_COVERAGE_COL,
|
||||
STREET_TREE_DENSITY_COL,
|
||||
),
|
||||
on="street_key",
|
||||
how="left",
|
||||
)
|
||||
address_output.sink_parquet(output_addresses, compression="zstd")
|
||||
print(f"Wrote address tree-density join: {output_addresses}")
|
||||
|
||||
if output_streets is not None:
|
||||
output_streets.parent.mkdir(parents=True, exist_ok=True)
|
||||
streets.write_parquet(output_streets, compression="zstd")
|
||||
print(f"Wrote street tree-density rollup: {output_streets}")
|
||||
|
||||
|
||||
def _parse_csv_arg(value: str | None) -> tuple[str, ...] | None:
|
||||
if value is None:
|
||||
return None
|
||||
if value.lower() == "all":
|
||||
return None
|
||||
parts = tuple(part.strip() for part in value.split(",") if part.strip())
|
||||
return parts or None
|
||||
|
||||
|
||||
def main() -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Build postcode and street tree-density metrics from FR_TOW_V1_ALL.zip"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tow-zip",
|
||||
type=Path,
|
||||
default=Path("property-data/FR_TOW_V1_ALL.zip"),
|
||||
help="Forest Research TOW zip containing FR_TOW_V1_ALL.gdb",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--extract-dir",
|
||||
type=Path,
|
||||
default=Path("property-data/fr_tow_v1_all"),
|
||||
help="Directory where the zip is extracted",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force-extract",
|
||||
action="store_true",
|
||||
help="Re-extract the TOW zip even if the geodatabase already exists",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use-vsizip",
|
||||
action="store_true",
|
||||
help="Read the geodatabase directly from the zip instead of extracting it",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--arcgis",
|
||||
type=Path,
|
||||
default=Path("property-data/arcgis_data.parquet"),
|
||||
help="Postcode centroid parquet with east1m/north1m columns",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--price-paid",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Optional Price Paid parquet used to roll postcode metrics up to streets",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-postcodes",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Output postcode-level tree-density parquet",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-streets",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Optional output street-level tree-density parquet",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-addresses",
|
||||
type=Path,
|
||||
default=None,
|
||||
help="Optional output address/street join parquet keyed by postcode and pp_address",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--radius-m",
|
||||
type=int,
|
||||
default=50,
|
||||
help="Radius around each postcode centroid used as the street-scale buffer",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tow-types",
|
||||
default=",".join(DEFAULT_TOW_TYPES),
|
||||
help='Comma-separated Woodland_Type values to include, or "all"',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--layers",
|
||||
default=None,
|
||||
help="Optional comma-separated subset of TOW layers for testing",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch-size",
|
||||
type=int,
|
||||
default=65_536,
|
||||
help="Arrow batch size for reading TOW features",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--workers",
|
||||
type=int,
|
||||
default=-1,
|
||||
help="Worker count passed to scipy cKDTree.query_ball_point",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-postcodes",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Testing only: process the first N postcode points",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-features-per-layer",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Testing only: process at most N TOW features per layer",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if (args.output_streets or args.output_addresses) and args.price_paid is None:
|
||||
raise SystemExit("--price-paid is required when writing street/address outputs")
|
||||
|
||||
if args.radius_m <= 0:
|
||||
raise SystemExit("--radius-m must be greater than zero")
|
||||
|
||||
dataset_path = _tow_dataset_path(
|
||||
args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip
|
||||
)
|
||||
points = _postcode_points(args.arcgis, args.max_postcodes)
|
||||
tow_types = _parse_csv_arg(args.tow_types)
|
||||
layer_names = _parse_csv_arg(args.layers)
|
||||
|
||||
postcode_metrics = _accumulate_tree_metrics(
|
||||
dataset_path=dataset_path,
|
||||
points=points,
|
||||
radius_m=args.radius_m,
|
||||
tow_types=tow_types,
|
||||
batch_size=args.batch_size,
|
||||
layer_names=layer_names,
|
||||
max_features_per_layer=args.max_features_per_layer,
|
||||
workers=args.workers,
|
||||
)
|
||||
postcode_metrics = _with_postcode_density_percentiles(
|
||||
postcode_metrics, args.radius_m
|
||||
)
|
||||
|
||||
args.output_postcodes.parent.mkdir(parents=True, exist_ok=True)
|
||||
postcode_metrics.write_parquet(args.output_postcodes, compression="zstd")
|
||||
print(f"\nWrote postcode tree-density metrics: {args.output_postcodes}")
|
||||
|
||||
if args.price_paid is not None:
|
||||
_write_street_rollups(
|
||||
postcode_metrics=postcode_metrics,
|
||||
price_paid_path=args.price_paid,
|
||||
output_streets=args.output_streets,
|
||||
output_addresses=args.output_addresses,
|
||||
radius_m=args.radius_m,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue