scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
|
|
@ -1,10 +1,16 @@
|
|||
"""Derive street-scale tree density metrics from Forest Research TOW data.
|
||||
"""Derive street-scale tree density metrics from Forest Research TOW + NFI data.
|
||||
|
||||
The Forest Research Trees Outside Woodland release is an Esri File Geodatabase
|
||||
inside property-data/FR_TOW_V1_ALL.zip. This transformer computes a compact
|
||||
postcode-level metric from the tree polygons, then optionally rolls that up to
|
||||
Price Paid street names so the dashboard can answer "what is this address's
|
||||
street like?" without loading the full geodatabase at runtime.
|
||||
|
||||
TOW only covers trees *outside* woodland, so the National Forest Inventory (NFI)
|
||||
woodland layer is optionally unioned in. TOW canopy is accumulated by centroid
|
||||
proximity (tiny crowns), while large NFI woodland parcels are accumulated by
|
||||
true buffer-clipped intersection area so they cannot saturate a postcode from
|
||||
mere centroid proximity.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
|
@ -22,7 +28,6 @@ import shapely
|
|||
from scipy.spatial import cKDTree
|
||||
|
||||
|
||||
DEFAULT_TOW_TYPES = ("Lone Tree", "Group of Trees")
|
||||
TOW_GDB_NAME = "FR_TOW_V1_ALL.gdb"
|
||||
STREET_TREE_DENSITY_COL = "Street tree density percentile"
|
||||
STREET_TREE_COVERAGE_COL = "Street tree coverage (%)"
|
||||
|
|
@ -32,6 +37,14 @@ POSTCODE_AREA_COL = "Tree canopy area within {radius}m (sqm)"
|
|||
POSTCODE_COUNT_COL = "Tree features within {radius}m"
|
||||
POSTCODE_HEIGHT_COL = "Mean TOW height within {radius}m (m)"
|
||||
|
||||
# National Forest Inventory (NFI) woodland — the geometric complement of TOW.
|
||||
# NFI ships as a zipped shapefile of woodland parcels (>=0.5 ha) in EPSG:27700.
|
||||
# Field names are from the NFI Woodland England 2022 release; re-check on bumps.
|
||||
NFI_CATEGORY_COL = "CATEGORY"
|
||||
NFI_WOODLAND_VALUE = "Woodland"
|
||||
NFI_TYPE_COL = "IFT_IOA"
|
||||
NFI_AREA_HA_COL = "Area_ha"
|
||||
|
||||
|
||||
def _safe_extract_zip(zip_path: Path, extract_dir: Path, force: bool) -> Path:
|
||||
"""Extract the TOW zip and return the extracted .gdb path."""
|
||||
|
|
@ -83,12 +96,60 @@ def _tow_dataset_path(
|
|||
return str(_safe_extract_zip(zip_path, extract_dir, force_extract))
|
||||
|
||||
|
||||
def _where_for_tow_types(tow_types: tuple[str, ...] | None) -> str | None:
|
||||
if not tow_types:
|
||||
return None
|
||||
escaped = [tow_type.replace("'", "''") for tow_type in tow_types]
|
||||
values = ", ".join(f"'{tow_type}'" for tow_type in escaped)
|
||||
return f"Woodland_Type IN ({values})"
|
||||
def _safe_extract_zip_dir(zip_path: Path, extract_dir: Path, force: bool) -> Path:
|
||||
"""Extract an arbitrary zip into extract_dir and return the directory."""
|
||||
if extract_dir.exists() and not force:
|
||||
print(f"Using existing extraction directory: {extract_dir}")
|
||||
return extract_dir
|
||||
if extract_dir.exists():
|
||||
shutil.rmtree(extract_dir)
|
||||
|
||||
tmp_dir = extract_dir.with_name(f".{extract_dir.name}.tmp")
|
||||
if tmp_dir.exists():
|
||||
shutil.rmtree(tmp_dir)
|
||||
tmp_dir.mkdir(parents=True)
|
||||
|
||||
root = tmp_dir.resolve()
|
||||
print(f"Extracting {zip_path} to {extract_dir}...")
|
||||
with zipfile.ZipFile(zip_path) as archive:
|
||||
for member in archive.infolist():
|
||||
target = (tmp_dir / member.filename).resolve()
|
||||
if root != target and root not in target.parents:
|
||||
raise ValueError(f"Unsafe path in zip archive: {member.filename}")
|
||||
if member.is_dir():
|
||||
target.mkdir(parents=True, exist_ok=True)
|
||||
continue
|
||||
target.parent.mkdir(parents=True, exist_ok=True)
|
||||
with archive.open(member) as source, target.open("wb") as dest:
|
||||
shutil.copyfileobj(source, dest, length=1024 * 1024)
|
||||
|
||||
tmp_dir.rename(extract_dir)
|
||||
print(f"Extracted archive: {extract_dir}")
|
||||
return extract_dir
|
||||
|
||||
|
||||
def _nfi_dataset_path(
|
||||
zip_path: Path, extract_dir: Path, force_extract: bool, use_vsizip: bool
|
||||
) -> str:
|
||||
"""Resolve the NFI woodland shapefile path, extracting the zip if needed."""
|
||||
if use_vsizip:
|
||||
return f"/vsizip/{zip_path.resolve()}"
|
||||
extracted = _safe_extract_zip_dir(zip_path, extract_dir, force_extract)
|
||||
shapefiles = sorted(extracted.rglob("*.shp"))
|
||||
if not shapefiles:
|
||||
raise FileNotFoundError(f"No .shp found inside {zip_path}")
|
||||
return str(shapefiles[0])
|
||||
|
||||
|
||||
def _geometry_column(metadata: dict, column_names: list[str]) -> str:
|
||||
"""Resolve the geometry column name from pyogrio Arrow metadata."""
|
||||
geometry_name = metadata.get("geometry_name")
|
||||
if geometry_name:
|
||||
return str(geometry_name)
|
||||
for name in ("wkb_geometry", "geometry", "geom"):
|
||||
if name in column_names:
|
||||
return name
|
||||
return column_names[-1]
|
||||
|
||||
|
||||
def _postcode_points(arcgis_path: Path, max_postcodes: int | None) -> pl.DataFrame:
|
||||
|
|
@ -172,26 +233,20 @@ def _accumulate_tree_metrics(
|
|||
dataset_path: str,
|
||||
points: pl.DataFrame,
|
||||
radius_m: int,
|
||||
tow_types: tuple[str, ...] | None,
|
||||
batch_size: int,
|
||||
layer_names: tuple[str, ...] | None,
|
||||
max_features_per_layer: int | None,
|
||||
workers: int,
|
||||
) -> pl.DataFrame:
|
||||
canopy_area: np.ndarray,
|
||||
feature_count: np.ndarray,
|
||||
height_weighted_sum: np.ndarray,
|
||||
height_weight: np.ndarray,
|
||||
) -> None:
|
||||
xy = points.select("x", "y").to_numpy()
|
||||
tree = cKDTree(xy)
|
||||
n_points = points.height
|
||||
|
||||
canopy_area = np.zeros(n_points, dtype=np.float64)
|
||||
feature_count = np.zeros(n_points, dtype=np.uint32)
|
||||
height_weighted_sum = np.zeros(n_points, dtype=np.float64)
|
||||
height_weight = np.zeros(n_points, dtype=np.float64)
|
||||
|
||||
where = _where_for_tow_types(tow_types)
|
||||
layers = _layers(dataset_path, layer_names)
|
||||
print(f"Processing {len(layers)} TOW layer(s): {', '.join(layers)}")
|
||||
if where:
|
||||
print(f"TOW type filter: {where}")
|
||||
|
||||
columns = ["Woodland_Type", "TOW_Area_M", "MEANHT"]
|
||||
total_features_seen = 0
|
||||
|
|
@ -206,7 +261,6 @@ def _accumulate_tree_metrics(
|
|||
dataset_path,
|
||||
layer=layer,
|
||||
columns=columns,
|
||||
where=where,
|
||||
batch_size=batch_size,
|
||||
use_pyarrow=True,
|
||||
) as (_meta, reader):
|
||||
|
|
@ -297,6 +351,132 @@ def _accumulate_tree_metrics(
|
|||
f"{total_features_used:,} features with usable centroids"
|
||||
)
|
||||
|
||||
|
||||
def _postcode_buffers(
    points: pl.DataFrame, radius_m: int
) -> tuple[np.ndarray, shapely.STRtree]:
    """Return one buffered circle per postcode and an STRtree indexing them.

    The circles are built from the ``x``/``y`` columns of ``points`` in row
    order, so position ``i`` in the returned array (and in the tree) refers
    to row ``i`` of ``points``. Each circle is a polygon approximation with
    8 segments per quarter circle.
    """
    coords = points.select("x", "y").to_numpy()
    centres = shapely.points(coords)
    buffers = shapely.buffer(centres, radius_m, quad_segs=8)
    spatial_index = shapely.STRtree(buffers)
    return buffers, spatial_index
|
||||
|
||||
|
||||
def _add_nfi_batch(
    geoms: np.ndarray,
    category: np.ndarray,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    feature_count: np.ndarray,
    radius_m: int,
) -> None:
    """Add NFI woodland into the shared arrays by true buffer-clipped area.

    Unlike the TOW centroid path, this clips each woodland polygon to each
    nearby postcode circle and adds only area(polygon ∩ circle); a large parcel
    therefore cannot saturate a postcode from mere centroid proximity, and a
    buffer-filling parcel whose centroid is outside the radius is not missed.

    Args:
        geoms: Geometries for one batch, aligned with ``category``.
        category: Per-feature category values; only rows equal to
            ``NFI_WOODLAND_VALUE`` are used.
        circles: Postcode buffer polygons, indexed by postcode position.
        tree: STRtree built over ``circles`` (same ordering).
        canopy_area: Per-postcode area accumulator, updated in place.
        feature_count: Per-postcode feature counter, updated in place.
        radius_m: Unused here; the radius is already baked into ``circles``.
            Retained so existing callers keep working.
    """
    keep = (category == NFI_WOODLAND_VALUE) & ~shapely.is_missing(geoms)
    geoms = geoms[keep]
    if geoms.size:
        geoms = geoms[~shapely.is_empty(geoms)]
    if geoms.size == 0:
        return

    # The tree indexes the postcode circles themselves, not their centre
    # points, so "intersects" is the exact candidate test. A "dwithin" query
    # with distance=radius_m would measure from the circle edge and also
    # return postcodes up to ~2*radius_m away, every one of which clips to
    # zero area and is thrown away below.
    nfi_index, postcode_index = tree.query(geoms, predicate="intersects")
    if nfi_index.size == 0:
        return

    clipped_area = shapely.area(
        shapely.intersection(geoms[nfi_index], circles[postcode_index])
    )
    # Boundary-only contacts intersect but contribute no area; drop them so
    # they do not inflate the feature count.
    positive = clipped_area > 0
    postcode_index = postcode_index[positive]
    clipped_area = clipped_area[positive]

    # np.add.at accumulates correctly when a postcode index repeats.
    np.add.at(canopy_area, postcode_index, clipped_area)
    np.add.at(feature_count, postcode_index, 1)
|
||||
|
||||
|
||||
def _accumulate_nfi_metrics(
    dataset_path: str,
    circles: np.ndarray,
    tree: shapely.STRtree,
    canopy_area: np.ndarray,
    feature_count: np.ndarray,
    radius_m: int,
    batch_size: int,
    max_nfi_features: int | None,
) -> None:
    """Stream NFI features in Arrow batches and fold them into the shared arrays.

    Every layer of ``dataset_path`` is read with ``pyogrio.open_arrow``; each
    batch's category and geometry columns are handed to ``_add_nfi_batch``,
    which updates ``canopy_area`` and ``feature_count`` in place.

    ``max_nfi_features`` caps the total number of rows processed across all
    layers; the batch that crosses the cap is truncated so the cap is hit
    exactly. ``None`` means no cap.
    """

    def _object_column(record_batch, position):
        # Materialise one Arrow column as a NumPy object array.
        return np.asarray(
            record_batch.column(position).to_numpy(zero_copy_only=False),
            dtype=object,
        )

    layers = _layers(dataset_path, None)
    print(f"Processing {len(layers)} NFI layer(s): {', '.join(layers)}")

    # Only the woodland flag is requested alongside the geometry: the area
    # comes from clipping against the postcode buffer, not from the file.
    columns = [NFI_CATEGORY_COL]
    capped = max_nfi_features is not None
    features_seen = 0

    for layer in layers:
        arrow_stream = pyogrio.open_arrow(
            dataset_path,
            layer=layer,
            columns=columns,
            batch_size=batch_size,
            use_pyarrow=True,
        )
        with arrow_stream as (meta, reader):
            for batch_index, batch in enumerate(reader, start=1):
                if capped:
                    remaining = max_nfi_features - features_seen
                    if remaining <= 0:
                        break
                    if batch.num_rows > remaining:
                        batch = batch.slice(0, remaining)

                features_seen += batch.num_rows
                names = batch.schema.names
                geometry_column = _geometry_column(meta, names)
                category = _object_column(batch, names.index(NFI_CATEGORY_COL))
                geometry = _object_column(batch, names.index(geometry_column))

                _add_nfi_batch(
                    shapely.from_wkb(geometry),
                    category,
                    circles,
                    tree,
                    canopy_area,
                    feature_count,
                    radius_m,
                )

                # Progress line on the first batch and every 25th after it.
                if batch_index == 1 or batch_index % 25 == 0:
                    print(f" NFI batch {batch_index:,}: {features_seen:,} rows read")
|
||||
|
||||
|
||||
def _finalize_metrics(
|
||||
points: pl.DataFrame,
|
||||
canopy_area: np.ndarray,
|
||||
feature_count: np.ndarray,
|
||||
height_weighted_sum: np.ndarray,
|
||||
height_weight: np.ndarray,
|
||||
radius_m: int,
|
||||
) -> pl.DataFrame:
|
||||
n_points = points.height
|
||||
density_col, area_col, count_col, height_col = _metric_columns(radius_m)
|
||||
buffer_area = math.pi * radius_m * radius_m
|
||||
density_pct = np.minimum(canopy_area / buffer_area * 100.0, 100.0)
|
||||
|
|
@ -518,6 +698,18 @@ def main() -> None:
|
|||
action="store_true",
|
||||
help="Read the geodatabase directly from the zip instead of extracting it",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--nfi-zip",
|
||||
type=Path,
|
||||
default=Path("property-data/NFI_WOODLAND_ENGLAND.zip"),
|
||||
help="Optional NFI woodland shapefile zip to union with TOW (skipped if absent)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--nfi-extract-dir",
|
||||
type=Path,
|
||||
default=Path("property-data/nfi_woodland_england"),
|
||||
help="Directory where the NFI zip is extracted",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--arcgis",
|
||||
type=Path,
|
||||
|
|
@ -554,11 +746,6 @@ def main() -> None:
|
|||
default=50,
|
||||
help="Radius around each postcode centroid used as the street-scale buffer",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tow-types",
|
||||
default=",".join(DEFAULT_TOW_TYPES),
|
||||
help='Comma-separated Woodland_Type values to include, or "all"',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--layers",
|
||||
default=None,
|
||||
|
|
@ -588,6 +775,12 @@ def main() -> None:
|
|||
default=None,
|
||||
help="Testing only: process at most N TOW features per layer",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-nfi-features",
|
||||
type=int,
|
||||
default=None,
|
||||
help="Testing only: process at most N NFI woodland features",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
if (args.output_streets or args.output_addresses) and args.price_paid is None:
|
||||
|
|
@ -600,18 +793,53 @@ def main() -> None:
|
|||
args.tow_zip, args.extract_dir, args.force_extract, args.use_vsizip
|
||||
)
|
||||
points = _postcode_points(args.arcgis, args.max_postcodes)
|
||||
tow_types = _parse_csv_arg(args.tow_types)
|
||||
layer_names = _parse_csv_arg(args.layers)
|
||||
|
||||
postcode_metrics = _accumulate_tree_metrics(
|
||||
n_points = points.height
|
||||
canopy_area = np.zeros(n_points, dtype=np.float64)
|
||||
feature_count = np.zeros(n_points, dtype=np.uint32)
|
||||
height_weighted_sum = np.zeros(n_points, dtype=np.float64)
|
||||
height_weight = np.zeros(n_points, dtype=np.float64)
|
||||
|
||||
_accumulate_tree_metrics(
|
||||
dataset_path=dataset_path,
|
||||
points=points,
|
||||
radius_m=args.radius_m,
|
||||
tow_types=tow_types,
|
||||
batch_size=args.batch_size,
|
||||
layer_names=layer_names,
|
||||
max_features_per_layer=args.max_features_per_layer,
|
||||
workers=args.workers,
|
||||
canopy_area=canopy_area,
|
||||
feature_count=feature_count,
|
||||
height_weighted_sum=height_weighted_sum,
|
||||
height_weight=height_weight,
|
||||
)
|
||||
|
||||
if args.nfi_zip is not None and args.nfi_zip.exists():
|
||||
nfi_path = _nfi_dataset_path(
|
||||
args.nfi_zip, args.nfi_extract_dir, args.force_extract, args.use_vsizip
|
||||
)
|
||||
circles, nfi_tree = _postcode_buffers(points, args.radius_m)
|
||||
_accumulate_nfi_metrics(
|
||||
dataset_path=nfi_path,
|
||||
circles=circles,
|
||||
tree=nfi_tree,
|
||||
canopy_area=canopy_area,
|
||||
feature_count=feature_count,
|
||||
radius_m=args.radius_m,
|
||||
batch_size=args.batch_size,
|
||||
max_nfi_features=args.max_nfi_features,
|
||||
)
|
||||
elif args.nfi_zip is not None:
|
||||
print(f"NFI zip not found, skipping woodland union: {args.nfi_zip}")
|
||||
|
||||
postcode_metrics = _finalize_metrics(
|
||||
points,
|
||||
canopy_area,
|
||||
feature_count,
|
||||
height_weighted_sum,
|
||||
height_weight,
|
||||
args.radius_m,
|
||||
)
|
||||
postcode_metrics = _with_postcode_density_percentiles(
|
||||
postcode_metrics, args.radius_m
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue