improve
This commit is contained in:
parent
8688b7475e
commit
e8345cbdc1
40 changed files with 1980 additions and 904 deletions
|
|
@ -1,10 +1,12 @@
|
|||
import json
|
||||
import shutil
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
from pyproj import Transformer
|
||||
from shapely import make_valid
|
||||
from shapely.geometry import MultiPolygon, Polygon
|
||||
from shapely import make_valid, set_precision
|
||||
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
|
||||
from shapely.ops import transform as transform_geometry
|
||||
from shapely.ops import unary_union
|
||||
from tqdm import tqdm
|
||||
|
||||
|
|
@ -18,49 +20,47 @@ def _get_to_wgs84():
|
|||
return _to_wgs84
|
||||
|
||||
|
||||
def _largest_polygonal(geom) -> Polygon | None:
|
||||
if geom is None or geom.is_empty:
|
||||
return None
|
||||
if not geom.is_valid:
|
||||
geom = make_valid(geom)
|
||||
if geom.geom_type == "Polygon":
|
||||
return geom
|
||||
if geom.geom_type == "MultiPolygon":
|
||||
return max(geom.geoms, key=lambda g: g.area)
|
||||
if geom.geom_type == "GeometryCollection":
|
||||
polygons = [
|
||||
polygon
|
||||
for part in geom.geoms
|
||||
if (polygon := _largest_polygonal(part)) is not None
|
||||
]
|
||||
if polygons:
|
||||
return max(polygons, key=lambda g: g.area)
|
||||
return None
|
||||
|
||||
|
||||
def to_wgs84_geojson(
|
||||
geom: Polygon | MultiPolygon, tolerance: float = 1.0
|
||||
) -> dict | None:
|
||||
"""Simplify geometry in BNG, convert to WGS84, return GeoJSON dict."""
|
||||
if geom.is_empty:
|
||||
geom = _largest_polygonal(geom)
|
||||
if geom is None:
|
||||
return None
|
||||
|
||||
simplified = geom.simplify(tolerance, preserve_topology=True)
|
||||
if simplified.is_empty:
|
||||
simplified = _largest_polygonal(simplified)
|
||||
if simplified is None:
|
||||
return None
|
||||
|
||||
transformer = _get_to_wgs84()
|
||||
|
||||
def transform_ring(coords):
|
||||
xs, ys = zip(*coords)
|
||||
lons, lats = transformer.transform(list(xs), list(ys))
|
||||
return [(round(lon, 6), round(lat, 6)) for lon, lat in zip(lons, lats)]
|
||||
|
||||
def transform_polygon(poly):
|
||||
exterior = transform_ring(poly.exterior.coords)
|
||||
holes = [transform_ring(h.coords) for h in poly.interiors]
|
||||
return [exterior] + holes
|
||||
|
||||
# Force single Polygon — postcodes are contiguous delivery routes
|
||||
if simplified.geom_type == "MultiPolygon":
|
||||
simplified = max(simplified.geoms, key=lambda g: g.area)
|
||||
elif simplified.geom_type == "GeometryCollection":
|
||||
polys = [
|
||||
g for g in simplified.geoms if g.geom_type in ("Polygon", "MultiPolygon")
|
||||
]
|
||||
if not polys:
|
||||
return None
|
||||
simplified = max(polys, key=lambda g: g.area)
|
||||
if simplified.geom_type == "MultiPolygon":
|
||||
simplified = max(simplified.geoms, key=lambda g: g.area)
|
||||
|
||||
if simplified.geom_type != "Polygon" or simplified.is_empty:
|
||||
wgs84 = transform_geometry(transformer.transform, simplified)
|
||||
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
|
||||
wgs84 = _largest_polygonal(wgs84)
|
||||
if wgs84 is None:
|
||||
return None
|
||||
|
||||
return {
|
||||
"type": "Polygon",
|
||||
"coordinates": transform_polygon(simplified),
|
||||
}
|
||||
return mapping(wgs84)
|
||||
|
||||
|
||||
def _fill_holes(geom):
|
||||
|
|
@ -132,7 +132,11 @@ def write_district_geojson(
|
|||
) -> int:
|
||||
"""Group postcodes by district, write GeoJSON files. Returns file count."""
|
||||
units_dir = output_dir / "units"
|
||||
units_dir.mkdir(parents=True, exist_ok=True)
|
||||
tmp_units_dir = output_dir / "units.tmp"
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
if tmp_units_dir.exists():
|
||||
shutil.rmtree(tmp_units_dir)
|
||||
tmp_units_dir.mkdir(parents=True)
|
||||
|
||||
by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
|
||||
for pc, geom in postcodes.items():
|
||||
|
|
@ -141,14 +145,23 @@ def write_district_geojson(
|
|||
by_district[district].append((pc, geom))
|
||||
|
||||
file_count = 0
|
||||
seen_postcodes: set[str] = set()
|
||||
for district, entries in tqdm(
|
||||
sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
|
||||
):
|
||||
features = []
|
||||
for pc, geom in sorted(entries, key=lambda x: x[0]):
|
||||
if pc in seen_postcodes:
|
||||
raise ValueError(f"Duplicate postcode boundary feature: {pc}")
|
||||
seen_postcodes.add(pc)
|
||||
geojson_geom = to_wgs84_geojson(geom)
|
||||
if geojson_geom is None:
|
||||
continue
|
||||
raise ValueError(f"Postcode boundary collapsed to empty geometry: {pc}")
|
||||
written_geom = shape(geojson_geom)
|
||||
if written_geom.is_empty or not written_geom.is_valid:
|
||||
raise ValueError(
|
||||
f"Invalid postcode boundary geometry after output: {pc}"
|
||||
)
|
||||
mapit_code = pc.replace(" ", "")
|
||||
features.append(
|
||||
{
|
||||
|
|
@ -165,9 +178,12 @@ def write_district_geojson(
|
|||
continue
|
||||
|
||||
collection = {"type": "FeatureCollection", "features": features}
|
||||
out_path = units_dir / f"{district}.geojson"
|
||||
out_path = tmp_units_dir / f"{district}.geojson"
|
||||
with open(out_path, "w") as f:
|
||||
json.dump(collection, f, separators=(",", ":"))
|
||||
file_count += 1
|
||||
|
||||
if units_dir.exists():
|
||||
shutil.rmtree(units_dir)
|
||||
tmp_units_dir.replace(units_dir)
|
||||
return file_count
|
||||
|
|
|
|||
|
|
@ -3,6 +3,8 @@
|
|||
Each test targets a specific bug or edge case identified during code review.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import pytest
|
||||
|
|
@ -11,7 +13,12 @@ from shapely.ops import unary_union
|
|||
|
||||
from .oa_boundaries import parse_gpkg_geometry
|
||||
from .greenspace import subtract_greenspace
|
||||
from .output import _fill_holes, merge_fragments, to_wgs84_geojson
|
||||
from .output import (
|
||||
_fill_holes,
|
||||
merge_fragments,
|
||||
to_wgs84_geojson,
|
||||
write_district_geojson,
|
||||
)
|
||||
from .process_oa import _extract_polygonal, process_oa
|
||||
from .uprn import get_oa_uprns, load_uprns
|
||||
from .voronoi import _equal_split_fallback, compute_voronoi_regions
|
||||
|
|
@ -154,6 +161,7 @@ class TestWhitespacePostcodes:
|
|||
"pcds": ["AA1 1AA", "AA1 1AB"],
|
||||
"east1m": [500010, 500030],
|
||||
"north1m": [180010, 180020],
|
||||
"oa21cd": ["E00000001", "E00000001"],
|
||||
"doterm": ["2020-01-01", None],
|
||||
"ctry25cd": ["E92000001", "E92000001"],
|
||||
}
|
||||
|
|
@ -165,6 +173,65 @@ class TestWhitespacePostcodes:
|
|||
|
||||
assert loaded_df["PCDS"].to_list() == ["AA1 1AB"]
|
||||
|
||||
def test_arcgis_filters_to_active_english_postcodes(self, tmp_path):
|
||||
uprns = pl.DataFrame(
|
||||
{
|
||||
"GRIDGB1E": [500010, 500020],
|
||||
"GRIDGB1N": [180010, 180020],
|
||||
"PCDS": ["AA1 1AA", "CF1 1AA"],
|
||||
"OA21CD": ["E00000001", "E00000001"],
|
||||
}
|
||||
)
|
||||
uprn_path = tmp_path / "uprn.parquet"
|
||||
uprns.write_parquet(uprn_path)
|
||||
arcgis = pl.DataFrame(
|
||||
{
|
||||
"pcds": ["AA1 1AA", "CF1 1AA"],
|
||||
"east1m": [500010, 300010],
|
||||
"north1m": [180010, 220010],
|
||||
"oa21cd": ["E00000001", "W00000001"],
|
||||
"doterm": [None, None],
|
||||
"ctry25cd": ["E92000001", "W92000004"],
|
||||
}
|
||||
)
|
||||
arcgis_path = tmp_path / "arcgis.parquet"
|
||||
arcgis.write_parquet(arcgis_path)
|
||||
|
||||
loaded_df, _offsets = load_uprns(uprn_path, arcgis_path)
|
||||
|
||||
assert loaded_df["PCDS"].to_list() == ["AA1 1AA"]
|
||||
|
||||
def test_arcgis_adds_centroid_seed_for_active_postcode_without_uprn(self, tmp_path):
|
||||
uprns = pl.DataFrame(
|
||||
{
|
||||
"GRIDGB1E": [500010],
|
||||
"GRIDGB1N": [180010],
|
||||
"PCDS": ["AA1 1AA"],
|
||||
"OA21CD": ["E00000001"],
|
||||
}
|
||||
)
|
||||
uprn_path = tmp_path / "uprn.parquet"
|
||||
uprns.write_parquet(uprn_path)
|
||||
arcgis = pl.DataFrame(
|
||||
{
|
||||
"pcds": ["AA1 1AA", "BB1 1BB"],
|
||||
"east1m": [500010, 510000],
|
||||
"north1m": [180010, 190000],
|
||||
"oa21cd": ["E00000001", "E00000002"],
|
||||
"doterm": [None, None],
|
||||
"ctry25cd": ["E92000001", "E92000001"],
|
||||
}
|
||||
)
|
||||
arcgis_path = tmp_path / "arcgis.parquet"
|
||||
arcgis.write_parquet(arcgis_path)
|
||||
|
||||
loaded_df, offsets = load_uprns(uprn_path, arcgis_path)
|
||||
|
||||
assert set(loaded_df["PCDS"].to_list()) == {"AA1 1AA", "BB1 1BB"}
|
||||
points, postcodes = get_oa_uprns(loaded_df, offsets, "E00000002")
|
||||
assert postcodes == ["BB1 1BB"]
|
||||
assert points.tolist() == [[510000.0, 190000.0]]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bug 3: Voronoi deduplication is first-seen-wins
|
||||
|
|
@ -450,7 +517,9 @@ class TestProcessOAInspireParcelAssignment:
|
|||
)
|
||||
postcodes = ["A", "B"]
|
||||
|
||||
fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[left, right])
|
||||
fragments = process_oa(
|
||||
oa_geom, points, postcodes, inspire_candidates=[left, right]
|
||||
)
|
||||
frag_dict = dict(fragments)
|
||||
|
||||
assert "A" in frag_dict and "B" in frag_dict
|
||||
|
|
@ -494,7 +563,9 @@ class TestProcessOAInspireParcelAssignment:
|
|||
)
|
||||
postcodes = ["A", "B"]
|
||||
|
||||
fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[left, right])
|
||||
fragments = process_oa(
|
||||
oa_geom, points, postcodes, inspire_candidates=[left, right]
|
||||
)
|
||||
frag_dict = dict(fragments)
|
||||
|
||||
assert "A" in frag_dict and "B" in frag_dict
|
||||
|
|
@ -539,7 +610,9 @@ class TestProcessOAInspireParcelAssignment:
|
|||
)
|
||||
postcodes = ["A", "B"]
|
||||
|
||||
fragments = process_oa(oa_geom, points, postcodes, inspire_candidates=[straddling])
|
||||
fragments = process_oa(
|
||||
oa_geom, points, postcodes, inspire_candidates=[straddling]
|
||||
)
|
||||
|
||||
for _, geom in fragments:
|
||||
assert geom.difference(oa_geom).area < 0.01
|
||||
|
|
@ -651,6 +724,22 @@ class TestToWgs84Geojson:
|
|||
assert lon_dp <= 6, f"Longitude {lon_s} has {lon_dp} decimal places"
|
||||
assert lat_dp <= 6, f"Latitude {lat_s} has {lat_dp} decimal places"
|
||||
|
||||
def test_write_district_geojson_replaces_stale_units(self, tmp_path):
|
||||
stale_units = tmp_path / "units"
|
||||
stale_units.mkdir()
|
||||
(stale_units / "ZZ1.geojson").write_text(
|
||||
json.dumps({"type": "FeatureCollection", "features": []})
|
||||
)
|
||||
|
||||
file_count = write_district_geojson(
|
||||
{"AA1 1AA": box(530000, 180000, 530100, 180100)}, tmp_path
|
||||
)
|
||||
|
||||
assert file_count == 1
|
||||
assert not (stale_units / "ZZ1.geojson").exists()
|
||||
written = json.loads((stale_units / "AA1.geojson").read_text())
|
||||
assert written["features"][0]["properties"]["postcodes"] == "AA1 1AA"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Edge case: parse_gpkg_geometry rejects unknown envelope types
|
||||
|
|
|
|||
|
|
@ -13,6 +13,33 @@ def _canonical_postcode_expr(name: str) -> pl.Expr:
|
|||
return pl.col(name).str.strip_chars().str.to_uppercase()
|
||||
|
||||
|
||||
def _active_english_arcgis_postcodes(arcgis_path: Path) -> pl.LazyFrame:
|
||||
return (
|
||||
pl.read_parquet(
|
||||
arcgis_path,
|
||||
columns=["pcds", "east1m", "north1m", "oa21cd", "ctry25cd", "doterm"],
|
||||
)
|
||||
.lazy()
|
||||
.filter(pl.col("ctry25cd") == "E92000001")
|
||||
.filter(pl.col("doterm").cast(pl.Utf8).is_null())
|
||||
.select(
|
||||
_canonical_postcode_expr("pcds").alias("PCDS"),
|
||||
pl.col("east1m").cast(pl.Float64).alias("GRIDGB1E"),
|
||||
pl.col("north1m").cast(pl.Float64).alias("GRIDGB1N"),
|
||||
pl.col("oa21cd").alias("OA21CD"),
|
||||
)
|
||||
.filter(
|
||||
pl.col("PCDS").is_not_null()
|
||||
& (pl.col("PCDS") != "")
|
||||
& pl.col("GRIDGB1E").is_not_null()
|
||||
& pl.col("GRIDGB1N").is_not_null()
|
||||
& pl.col("OA21CD").is_not_null()
|
||||
& pl.col("OA21CD").str.starts_with("E")
|
||||
)
|
||||
.unique("PCDS")
|
||||
)
|
||||
|
||||
|
||||
def load_uprns(
|
||||
uprn_path: Path, arcgis_path: Path | None = None
|
||||
) -> tuple[pl.DataFrame, dict[str, tuple[int, int]]]:
|
||||
|
|
@ -25,6 +52,7 @@ def load_uprns(
|
|||
|
||||
print("Loading UPRN lookup...")
|
||||
mapping = None
|
||||
active_postcode_points = None
|
||||
if arcgis_path is not None:
|
||||
mapping = (
|
||||
build_postcode_mapping(arcgis_path)
|
||||
|
|
@ -34,6 +62,7 @@ def load_uprns(
|
|||
)
|
||||
.unique("old_postcode")
|
||||
)
|
||||
active_postcode_points = _active_english_arcgis_postcodes(arcgis_path)
|
||||
|
||||
# Sort via streaming sink to avoid polars doubling memory during in-memory sort
|
||||
with tempfile.NamedTemporaryFile(
|
||||
|
|
@ -51,11 +80,21 @@ def load_uprns(
|
|||
|
||||
if mapping is not None and mapping.height > 0:
|
||||
uprns = (
|
||||
uprns.join(mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left")
|
||||
uprns.join(
|
||||
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
|
||||
)
|
||||
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
|
||||
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||
)
|
||||
|
||||
if active_postcode_points is not None:
|
||||
active_postcodes = active_postcode_points.select("PCDS").unique()
|
||||
uprns = uprns.join(active_postcodes, on="PCDS", how="semi")
|
||||
missing_active = active_postcode_points.join(
|
||||
uprns.select("PCDS").unique(), on="PCDS", how="anti"
|
||||
).select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||
uprns = pl.concat([uprns, missing_active], how="vertical_relaxed")
|
||||
|
||||
uprns.sort("OA21CD").sink_parquet(tmp_path)
|
||||
release_memory()
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue