idk
This commit is contained in:
parent
a04ac2d857
commit
d43da9708c
47 changed files with 4120 additions and 573 deletions
|
|
@ -95,11 +95,14 @@ def transform_crime(
|
|||
f"({valid_months[0]} to {valid_months[-1]})"
|
||||
)
|
||||
|
||||
# Count monthly incidents, then annualise over every valid month in the dataset.
|
||||
# `_weight` (≤1) comes from the LSOA 2011→2021 lookup: 2011 LSOAs that split
|
||||
# into N 2021 LSOAs contribute 1/N of their count to each child, since we
|
||||
# don't know which child a given incident actually belonged to.
|
||||
yearly_counts = (
|
||||
# Annualise each year separately (count_in_year * 12 / months_in_year), then
|
||||
# take the simple mean of those per-year rates over the years each type is
|
||||
# present. This makes the headline equal the average of the by-year chart bars
|
||||
# (_write_crime_by_year) instead of a month-weighted pooled rate, mirroring
|
||||
# crime_spatial._write_avg_yr. `_weight` (≤1) comes from the LSOA 2011→2021
|
||||
# lookup: 2011 LSOAs that split into N 2021 LSOAs contribute 1/N of their count
|
||||
# to each child, since we don't know which child an incident actually belonged to.
|
||||
filtered = (
|
||||
df.filter(
|
||||
valid_month_expr
|
||||
& pl.col("LSOA code").is_not_null()
|
||||
|
|
@ -107,15 +110,31 @@ def transform_crime(
|
|||
& pl.col("Crime type").is_not_null()
|
||||
& (pl.col("Crime type") != "")
|
||||
)
|
||||
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
|
||||
.group_by("LSOA code", "Month", "Crime type")
|
||||
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
|
||||
.group_by("LSOA code", "Crime type")
|
||||
.agg(
|
||||
(pl.col("count").sum() / pl.lit(valid_month_count) * 12)
|
||||
.round(1)
|
||||
.alias("yearly_avg")
|
||||
.with_columns(
|
||||
pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"),
|
||||
pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES),
|
||||
)
|
||||
)
|
||||
|
||||
# Months observed *anywhere* in the dataset for each year (annualisation
|
||||
# denominator), matching the by-year output's per-year scaling.
|
||||
months_per_year = filtered.group_by("year").agg(
|
||||
pl.col("Month").n_unique().alias("months_in_year")
|
||||
)
|
||||
|
||||
yearly_counts = (
|
||||
filtered.group_by("LSOA code", "year", "Crime type", "Month")
|
||||
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
|
||||
.group_by("LSOA code", "year", "Crime type")
|
||||
.agg(pl.col("count").sum().alias("count"))
|
||||
.join(months_per_year, on="year")
|
||||
.with_columns(
|
||||
(pl.col("count") * 12.0 / pl.col("months_in_year")).alias("per_year")
|
||||
)
|
||||
# Mean of the per-year annualised rates over the years the type is present
|
||||
# (only years with rows are grouped here, so this is the correct x-span).
|
||||
.group_by("LSOA code", "Crime type")
|
||||
.agg(pl.col("per_year").mean().round(1).alias("yearly_avg"))
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
if yearly_counts.is_empty():
|
||||
|
|
|
|||
|
|
@ -259,11 +259,14 @@ def _write_avg_yr(
|
|||
"""
|
||||
months = np.array([months_in_year[year] for year in years], dtype=np.float64)
|
||||
per_year = counts.astype(np.float64) * 12.0 / months[None, None, :]
|
||||
# Average over the years each type is actually observed anywhere -- the same
|
||||
# per-type x-span the by-year chart plots (server-rs/.../crime_by_year.rs).
|
||||
type_year_present = counts.sum(axis=0) > 0 # (n_types, n_years)
|
||||
years_per_type = np.clip(type_year_present.sum(axis=1), 1, None).astype(np.float64)
|
||||
avg = per_year.sum(axis=2) / years_per_type[None, :] # (n_postcodes, n_types)
|
||||
# Average over the years *this postcode* actually has incidents of *this
|
||||
# type* -- the same per-(postcode, type) x-span the by-year chart plots
|
||||
# (server-rs/.../crime_by_year.rs), so the headline equals the mean of the
|
||||
# by-year bars. Dividing by a global years-present count (years a type
|
||||
# appeared anywhere in England) would deflate postcodes whose incidents
|
||||
# cluster in only a few years of the ~13-year window.
|
||||
years_present = np.clip((counts > 0).sum(axis=2), 1, None).astype(np.float64)
|
||||
avg = per_year.sum(axis=2) / years_present # (n_postcodes, n_types)
|
||||
avg = np.round(avg * norm[:, None], 1).astype(np.float32)
|
||||
|
||||
data: dict[str, np.ndarray] = {"postcode": postcodes}
|
||||
|
|
|
|||
|
|
@ -365,6 +365,16 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
}
|
||||
duration_map = {"F": "Freehold", "L": "Leasehold"}
|
||||
|
||||
# price >= MIN_PRICE and ppd_category == "A" (standard open-market sale) are
|
||||
# VALUE-QUALITY filters: they gate the price aggregations only. Category B
|
||||
# entries (repossessions, bulk/portfolio, power-of-sale transfers) and sub-MIN
|
||||
# sales must not pollute latest_price / historical_prices (and the downstream
|
||||
# price-per-sqm feature), but they MUST still count for first_transfer_date /
|
||||
# old_new so a new-build's genuine earliest transfer year is preserved.
|
||||
price_ok = pl.col("price") >= MIN_PRICE
|
||||
category_ok = pl.col("ppd_category") == "A"
|
||||
quality_ok = price_ok & category_ok
|
||||
|
||||
price_paid = (
|
||||
pl.scan_parquet(price_paid_path)
|
||||
.select(
|
||||
|
|
@ -381,9 +391,9 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
"town_city",
|
||||
pl.col("duration").replace(duration_map),
|
||||
"old_new",
|
||||
"ppd_category",
|
||||
)
|
||||
.filter(pl.col("pp_property_type") != "Other")
|
||||
.filter(pl.col("price") >= MIN_PRICE)
|
||||
.with_columns(
|
||||
pl.concat_str(
|
||||
[pl.col("saon"), pl.col("paon"), pl.col("street")],
|
||||
|
|
@ -408,18 +418,26 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
pl.col("postcode").last(),
|
||||
pl.col("_pp_match_address").last(),
|
||||
pl.col("_pp_match_postcode").last(),
|
||||
# Price aggregations are restricted to quality-passing sales.
|
||||
pl.struct(
|
||||
pl.col("date_of_transfer").dt.year().alias("year"),
|
||||
pl.col("date_of_transfer").dt.month().cast(pl.UInt8).alias("month"),
|
||||
"price",
|
||||
).alias("historical_prices"),
|
||||
)
|
||||
.filter(quality_ok)
|
||||
.alias("historical_prices"),
|
||||
pl.col("pp_property_type").last(),
|
||||
pl.col("duration").last(),
|
||||
pl.col("price").last().alias("latest_price"),
|
||||
pl.col("date_of_transfer").last(),
|
||||
pl.col("price").filter(quality_ok).last().alias("latest_price"),
|
||||
pl.col("date_of_transfer").filter(quality_ok).last(),
|
||||
# first_transfer_date / old_new reflect the genuine earliest transfer
|
||||
# over the full per-group transaction stream (not value-filtered).
|
||||
pl.col("date_of_transfer").first().alias("first_transfer_date"),
|
||||
pl.col("old_new").first(),
|
||||
)
|
||||
# Preserve the property universe: previously a property needed >=1 sale
|
||||
# >=MIN_PRICE to form a group, so drop groups with no quality-passing sale.
|
||||
.filter(pl.col("latest_price").is_not_null())
|
||||
)
|
||||
|
||||
print("Price paid dataset")
|
||||
|
|
|
|||
|
|
@ -839,25 +839,36 @@ def _join_area_side_tables(
|
|||
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||
# postcode boundary), so it joins on postcode rather than LSOA.
|
||||
base = base.join(crime, on="postcode", how="left")
|
||||
serious_crime_cols = [
|
||||
"Violence and sexual offences (avg/yr)",
|
||||
"Robbery (avg/yr)",
|
||||
"Burglary (avg/yr)",
|
||||
"Possession of weapons (avg/yr)",
|
||||
]
|
||||
minor_crime_cols = [
|
||||
"Anti-social behaviour (avg/yr)",
|
||||
"Criminal damage and arson (avg/yr)",
|
||||
"Shoplifting (avg/yr)",
|
||||
"Bicycle theft (avg/yr)",
|
||||
"Theft from the person (avg/yr)",
|
||||
"Other theft (avg/yr)",
|
||||
"Vehicle crime (avg/yr)",
|
||||
"Public order (avg/yr)",
|
||||
"Drugs (avg/yr)",
|
||||
"Other crime (avg/yr)",
|
||||
]
|
||||
# The LEFT join leaves every per-type column null for postcodes absent from
|
||||
# the crime table; sum_horizontal alone would fabricate a "zero crime"
|
||||
# rollup there, so keep the rollup null when ALL components are null.
|
||||
base = base.with_columns(
|
||||
pl.sum_horizontal(
|
||||
"Violence and sexual offences (avg/yr)",
|
||||
"Robbery (avg/yr)",
|
||||
"Burglary (avg/yr)",
|
||||
"Possession of weapons (avg/yr)",
|
||||
).alias("serious_crime_avg_yr"),
|
||||
pl.sum_horizontal(
|
||||
"Anti-social behaviour (avg/yr)",
|
||||
"Criminal damage and arson (avg/yr)",
|
||||
"Shoplifting (avg/yr)",
|
||||
"Bicycle theft (avg/yr)",
|
||||
"Theft from the person (avg/yr)",
|
||||
"Other theft (avg/yr)",
|
||||
"Vehicle crime (avg/yr)",
|
||||
"Public order (avg/yr)",
|
||||
"Drugs (avg/yr)",
|
||||
"Other crime (avg/yr)",
|
||||
).alias("minor_crime_avg_yr"),
|
||||
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in serious_crime_cols]))
|
||||
.then(None)
|
||||
.otherwise(pl.sum_horizontal(serious_crime_cols))
|
||||
.alias("serious_crime_avg_yr"),
|
||||
pl.when(pl.all_horizontal([pl.col(c).is_null() for c in minor_crime_cols]))
|
||||
.then(None)
|
||||
.otherwise(pl.sum_horizontal(minor_crime_cols))
|
||||
.alias("minor_crime_avg_yr"),
|
||||
)
|
||||
|
||||
base = base.join(median_age, on="lsoa21", how="left")
|
||||
|
|
@ -1179,7 +1190,22 @@ def _load_listings_for_merge(listings_path: Path, arcgis_path: Path) -> pl.DataF
|
|||
# pages); tolerate its absence so older parquets and test fixtures still
|
||||
# load. Digits-only so it compares equal to the EPC register's UPRN.
|
||||
if "UPRN" in raw.collect_schema().names():
|
||||
uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
|
||||
# Mirror `_normalize_uprn` exactly so the listing key compares equal to
|
||||
# the candidate-side key for every dtype. For a Float UPRN we must
|
||||
# stringify via its integer form (100023336956.0 -> "100023336956"),
|
||||
# otherwise stripping non-digits from "100023336956.0" yields a bogus
|
||||
# trailing-zero key ("1000233369560") that never collides; and a
|
||||
# non-integral float (e.g. 1.5) must be rejected rather than mangled.
|
||||
uprn_col = pl.col("UPRN")
|
||||
if raw.collect_schema()["UPRN"].is_float():
|
||||
integral = uprn_col.cast(pl.Int64, strict=False)
|
||||
uprn_digits = (
|
||||
pl.when(integral == uprn_col)
|
||||
.then(integral.cast(pl.Utf8).str.replace_all(r"\D", ""))
|
||||
.otherwise(None)
|
||||
)
|
||||
else:
|
||||
uprn_digits = uprn_col.cast(pl.Utf8).str.replace_all(r"\D", "")
|
||||
listing_uprn_expr = (
|
||||
pl.when(uprn_digits.str.len_chars() > 0)
|
||||
.then(uprn_digits)
|
||||
|
|
@ -1615,9 +1641,23 @@ def _enrich_listings_with_direct_epc(
|
|||
|
||||
|
||||
def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
    """Fold each directly-matched EPC column into its raw counterpart.

    For every ``raw -> direct`` pair in ``_DIRECT_EPC_RAW_COLUMN_MAP`` the raw
    column is replaced by the raw value, falling back to the direct value
    where the raw one is null. ``was_council_house`` is special-cased so that
    a "Yes" on either side wins.

    Args:
        wide: Frame holding both the raw and the direct EPC columns.

    Returns:
        The frame with every mapped raw column overwritten by its merged value.
    """

    def _coalesced(raw_column: str, direct_column: str) -> pl.Expr:
        """Build the merged expression for one raw/direct column pair."""
        coalesce = pl.coalesce(pl.col(raw_column), pl.col(direct_column))
        # The raw property-level value is fill_null("No") upstream, so a plain
        # coalesce lets a non-null "No" override a directly-matched listing
        # "Yes". "Former council house" should fire if EITHER side says so.
        if raw_column == "was_council_house":
            return (
                pl.when((pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes"))
                .then(pl.lit("Yes"))
                .otherwise(coalesce)
                .alias(raw_column)
            )
        return coalesce.alias(raw_column)

    # Exactly one expression per mapped pair: the helper above is the single
    # source of truth, so no plain-coalesce expression may sit beside it.
    return wide.with_columns(
        [
            _coalesced(raw_column, direct_column)
            for raw_column, direct_column in _DIRECT_EPC_RAW_COLUMN_MAP.items()
        ]
    )
|
||||
|
|
|
|||
|
|
@ -12,11 +12,19 @@ from pipeline.utils.poi_counts import count_pois_per_postcode, min_distance_per_
|
|||
|
||||
# POI category groups for proximity counting (2km radius).
|
||||
# Names must match the friendly names produced by transform_poi.py / naptan.py.
|
||||
# "groceries" is filled in dynamically by _groceries_categories() because the
|
||||
# GEOLYTIX dataset stores the brand (e.g. "Tesco", "Aldi") in `category` rather
|
||||
# than the literal "Supermarket"; counting only the OSM strings here severely
|
||||
# understates the metric. See _groceries_categories below.
|
||||
POI_GROUPS_2KM = {
|
||||
"restaurants": ["Restaurant", "Fast Food"],
|
||||
"groceries": ["Greengrocer", "Supermarket", "Convenience Store"],
|
||||
}
|
||||
|
||||
# POI group whose members are counted for the static "groceries" 2km metric.
|
||||
# Covers both the OSM grocery categories (Supermarket, Convenience Store,
|
||||
# Greengrocer, ...) and the GEOLYTIX brand categories (Tesco, Aldi, ...).
|
||||
GROCERIES_GROUP = "Groceries"
|
||||
|
||||
# OS Open Greenspace function types used for park counts and distance calculation.
|
||||
# Uses the authoritative OS dataset instead of OSM point POIs for better coverage
|
||||
# of green spaces that are only mapped as polygons in OSM.
|
||||
|
|
@ -41,6 +49,26 @@ def _poi_category_slug(category: str) -> str:
|
|||
return slug or "poi"
|
||||
|
||||
|
||||
def _groceries_categories(pois: pl.DataFrame) -> list[str]:
    """List the distinct, sorted `category` values of the Groceries group.

    `count_pois_per_postcode` matches POIs on `category`, while the GEOLYTIX
    grocery dataset stores the brand name there (e.g. "Tesco", "Aldi") under
    group "Groceries" and never emits the literal "Supermarket". Gathering
    every category of that group therefore covers both the OSM strings and the
    brand names.

    Raises:
        ValueError: if `pois` has no 'group' column.
    """
    has_group = "group" in pois.columns
    if not has_group:
        raise ValueError("POI dataframe must include a 'group' column")

    # Restrict to the Groceries rows, then reduce to unique sorted categories.
    grocery_rows = pois.filter(pl.col("group") == GROCERIES_GROUP)
    distinct_categories = grocery_rows.select("category").unique()
    ordered = distinct_categories.sort("category")
    return ordered.to_series().to_list()
|
||||
|
||||
|
||||
def _build_poi_category_groups(
|
||||
pois: pl.DataFrame,
|
||||
) -> tuple[dict[str, list[str]], dict[str, str]]:
|
||||
|
|
@ -122,9 +150,15 @@ def main():
|
|||
pois = pl.read_parquet(args.pois)
|
||||
poi_category_groups, poi_display_names = _build_poi_category_groups(pois)
|
||||
|
||||
# Count static amenity groups within 2km.
|
||||
# Count static amenity groups within 2km. "groceries" is matched against
|
||||
# every Groceries category (OSM strings + GEOLYTIX brand names) so that
|
||||
# postcodes ringed by GEOLYTIX-only chains (Tesco, Aldi, ...) are counted.
|
||||
groups_2km = {
|
||||
**POI_GROUPS_2KM,
|
||||
"groceries": _groceries_categories(pois),
|
||||
}
|
||||
counts_2km = count_pois_per_postcode(
|
||||
postcodes, pois, groups=POI_GROUPS_2KM, radius_km=2
|
||||
postcodes, pois, groups=groups_2km, radius_km=2
|
||||
)
|
||||
|
||||
# Dynamic amenity filters: nearest distance plus counts within 2km and 5km for
|
||||
|
|
|
|||
|
|
@ -77,9 +77,9 @@ The output of `process_oa` is `list[(postcode, polygon)]` — the per-OA fragmen
|
|||
|
||||
### Phase 4: Merging and writing
|
||||
|
||||
**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 5m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps only the largest polygon — postcodes are contiguous delivery routes, so detached fragments are artifacts.
|
||||
**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 5m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps the largest part **plus any other part ≥ `_MIN_DETACHED_PART_AREA` (100 m²)** (`_keep_polygon_parts`); only sub-100 m² noise slivers are dropped. Keeping substantial detached parts matters because a postcode genuinely split across an OA seam (by a railway, river, or main road wider than the 5m buffer) would otherwise lose a chunk — measured at ~1.8% of merged area left as uncovered gaps (often 3000–5000 m² building blocks) before this change.
|
||||
|
||||
**GeoJSON output** (`output.py:write_district_geojson`): Groups postcodes by district (the outward code, e.g. `SW1A` from `SW1A 1AA`). For each district, converts every postcode polygon from BNG to WGS84 using pyproj, simplifies with 1m tolerance (Douglas-Peucker), rounds coordinates to 6 decimal places (~0.1m precision), and writes a single `{district}.geojson` FeatureCollection. Each Feature has `postcodes` (formatted like `"SW1A 1AA"`) and `mapit_code` (no space: `"SW1A1AA"`) in its properties.
|
||||
**GeoJSON output** (`output.py:write_district_geojson`): Two passes. Pass 1 converts every postcode from BNG to WGS84 (pyproj), simplifies with 1m tolerance (Douglas-Peucker), and snaps to 6 decimal places (~0.1m precision); multi-part postcodes become `MultiPolygon` (`to_wgs84_geojson_multi`, each part handled independently), single-part stay `Polygon`. The whole set is then made a **partition** (`_resolve_overlaps`): each postcode is trimmed by the union of its higher-priority overlapping neighbours, where **priority = ascending area** (smaller postcodes win contested ground). That single rule handles both seam overlap *and* containment — an enclosed postcode is always smaller than its container, so it keeps its area while the container gets a hole (the query uses both the `overlaps` and `contains` predicates, since `overlaps` alone excludes containment). This runs last, so nothing re-introduces overlap; a postcode that would be emptied keeps its original geometry, so no active postcode is dropped. Pass 2 groups postcodes by district (the outward code, e.g. `SW1A` from `SW1A 1AA`), rounds coordinates to 6dp, and writes a `{district}.geojson` FeatureCollection. Each Feature has `postcodes` (formatted like `"SW1A 1AA"`) and `mapit_code` (no space: `"SW1A1AA"`) in its properties.
|
||||
|
||||
## Memory architecture
|
||||
|
||||
|
|
@ -103,10 +103,10 @@ Key design choices:
|
|||
|
||||
## Key invariants
|
||||
|
||||
1. **Every square meter of every OA is assigned to exactly one postcode** — the combination of INSPIRE claiming + Voronoi fills the entire OA, and overlap resolution ensures no double-counting
|
||||
1. **No two postcodes cover the same ground in the output** — within an OA the INSPIRE claiming + Voronoi tile it with no overlap, and a final `_resolve_overlaps` partition pass removes the thin overlap strips that the merge buffer + per-postcode simplification introduce across OA seams (measured residual overlap ~0.01% of area)
|
||||
2. **Every postcode that exists in the UPRN data gets a polygon** — unless all its UPRNs share coordinates with another postcode's UPRNs (handled by jitter) or it has zero UPRNs
|
||||
3. **Postcode polygons never extend outside their OA(s)** — all geometry is clipped to OA boundaries
|
||||
4. **Output is always single Polygon, never MultiPolygon** — the largest-polygon extraction in both `merge_fragments` and `to_wgs84_geojson` ensures this
|
||||
4. **A postcode split across an OA seam keeps all its substantial parts** — `merge_fragments` keeps every part ≥ 100 m², and a postcode with more than one surviving part is emitted as a `MultiPolygon` while single-part postcodes remain a `Polygon` (the Rust server `postcodes.rs` and `loader.py` both parse MultiPolygon); only sub-100 m² noise slivers are dropped
|
||||
|
||||
## Module structure
|
||||
|
||||
|
|
|
|||
|
|
@ -1,12 +1,21 @@
|
|||
import argparse
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import shapely
|
||||
from shapely.geometry import MultiPolygon, Polygon
|
||||
from tqdm import tqdm
|
||||
|
||||
from .fragments_cache import (
|
||||
fragments_cache_is_fresh,
|
||||
load_fragments,
|
||||
save_fragments,
|
||||
)
|
||||
from .inspire import (
|
||||
build_inspire_index,
|
||||
cache_inspire,
|
||||
get_inspire_candidates,
|
||||
inspire_cache_exists,
|
||||
load_inspire,
|
||||
)
|
||||
|
|
@ -14,7 +23,206 @@ from .memory import release_memory
|
|||
from .oa_boundaries import load_oa_boundaries
|
||||
from .output import merge_fragments, write_district_geojson
|
||||
from .process_oa import process_oa
|
||||
from .uprn import get_oa_uprns, load_uprns
|
||||
from .uprn import extract_uprn_arrays, get_oa_uprns_arrays, load_uprns
|
||||
|
||||
Fragment = tuple[str, Polygon | MultiPolygon]
|
||||
|
||||
|
||||
def _oa_fragments(
    oa_code, oa_geoms, east, north, postcodes_arr, offsets, index
) -> tuple[list[Fragment], bool]:
    """Turn a single OA into its ``(postcode, geometry)`` fragments.

    The result is ``(fragments, is_single)``, where ``is_single`` is True when
    the OA holds exactly one distinct postcode and the whole OA geometry is
    handed to it without further processing. Both the sequential and the
    parallel driver call this, so they yield the same fragments. Any exception
    is re-raised as ``RuntimeError`` carrying the OA code, so a bad OA can be
    identified.
    """
    try:
        boundary = oa_geoms[oa_code]
        points, postcodes = get_oa_uprns_arrays(
            east, north, postcodes_arr, offsets, oa_code
        )
        distinct_postcodes = set(postcodes)
        if len(distinct_postcodes) != 1:
            # Several postcodes (or none): split the OA using INSPIRE candidates.
            parcel_candidates = index.candidates(boundary.bounds)
            split = process_oa(boundary, points, postcodes, parcel_candidates)
            return split, False
        # Fast path: the whole OA belongs to its only postcode.
        return [(postcodes[0], boundary)], True
    except Exception as exc:
        raise RuntimeError(f"Failed processing OA {oa_code}: {exc!r}") from exc
|
||||
|
||||
|
||||
# Worker-shared state. Populated in the parent before the pool forks; children
|
||||
# inherit it copy-on-write (the numpy/Arrow buffers + coords mmap stay shared,
|
||||
# never duplicated per worker). Read-only in workers.
|
||||
_WORKER_STATE: dict = {}
|
||||
|
||||
|
||||
def _process_oa_chunk(oa_codes: list[str]):
    """Worker entry point: convert a chunk of OA codes to WKB fragments.

    Inputs are read from the module-level ``_WORKER_STATE``. Geometries are
    returned WKB-encoded rather than as pickled Shapely objects to keep the
    IPC payload small.

    Returns:
        ``(postcodes, wkb_array, single_count, n_oas)``: the postcode of each
        fragment, the matching WKB geometries, how many OAs took the
        single-postcode path, and how many OAs the chunk contained.
    """
    shared = _WORKER_STATE
    collected: list[Fragment] = []
    single_total = 0

    for code in oa_codes:
        fragments, was_single = _oa_fragments(
            code,
            shared["oa_geoms"],
            shared["east"],
            shared["north"],
            shared["postcodes"],
            shared["offsets"],
            shared["index"],
        )
        collected.extend(fragments)
        single_total += was_single

    if not collected:
        # Nothing produced: return empty containers of the usual types.
        return [], np.empty(0, dtype=object), single_total, len(oa_codes)

    names = [postcode for postcode, _ in collected]
    geometries = np.array([geom for _, geom in collected], dtype=object)
    encoded = shapely.to_wkb(geometries)
    return names, encoded, single_total, len(oa_codes)
|
||||
|
||||
|
||||
def _resolve_workers(requested: int) -> int:
|
||||
"""Worker count: the explicit value if >0, otherwise all available CPUs."""
|
||||
if requested and requested > 0:
|
||||
return requested
|
||||
try:
|
||||
return max(1, len(os.sched_getaffinity(0)))
|
||||
except AttributeError:
|
||||
return max(1, os.cpu_count() or 1)
|
||||
|
||||
|
||||
def _process_oas(
    oa_codes, oa_geoms, east, north, postcodes_arr, offsets, index, workers
) -> tuple[list[Fragment], int]:
    """Run Phase 3 for every OA, optionally across `workers` processes.

    Runs in-process when ``workers <= 1`` or the ``fork`` start method is not
    available. Otherwise the inputs are published through ``_WORKER_STATE``
    before a fork-based pool is created, so the workers inherit them instead
    of receiving a copy per task. Chunks are consumed in completion order;
    fragment order is not significant because ``merge_fragments`` unions per
    postcode.

    Returns:
        ``(fragments, single_count)``: every ``(postcode, geometry)`` fragment
        and the number of OAs handled by the single-postcode path.
    """
    collected: list[Fragment] = []
    n_single = 0

    run_parallel = workers > 1 and "fork" in mp.get_all_start_methods()
    if not run_parallel:
        progress = tqdm(
            oa_codes, desc="Processing OAs", unit="OA", smoothing=0.01, miniters=100
        )
        for code in progress:
            fragments, was_single = _oa_fragments(
                code, oa_geoms, east, north, postcodes_arr, offsets, index
            )
            collected.extend(fragments)
            n_single += was_single
        return collected, n_single

    # Must be populated before the pool forks so the children see it.
    _WORKER_STATE.update(
        {
            "oa_geoms": oa_geoms,
            "east": east,
            "north": north,
            "postcodes": postcodes_arr,
            "offsets": offsets,
            "index": index,
        }
    )

    # Roughly 16 contiguous chunks per worker: small enough to balance the
    # uneven per-OA cost, contiguous to keep reads local.
    per_chunk = max(1, len(oa_codes) // (workers * 16))
    total = len(oa_codes)
    chunks = [oa_codes[start : start + per_chunk] for start in range(0, total, per_chunk)]
    print(f" Parallel: {workers} workers, {len(chunks)} chunks of ~{per_chunk} OAs")

    fork_ctx = mp.get_context("fork")
    try:
        with fork_ctx.Pool(processes=workers) as pool, tqdm(
            total=len(oa_codes), desc="Processing OAs", unit="OA", smoothing=0.01
        ) as bar:
            results = pool.imap_unordered(_process_oa_chunk, chunks)
            for names, encoded, chunk_single, chunk_oas in results:
                if len(encoded):
                    collected.extend(zip(names, shapely.from_wkb(encoded)))
                n_single += chunk_single
                bar.update(chunk_oas)
    finally:
        # Release the references so later phases don't keep the inputs alive.
        _WORKER_STATE.clear()
    return collected, n_single
|
||||
|
||||
|
||||
def build_fragments(args: argparse.Namespace) -> list[Fragment]:
    """Execute Phases 1-3 and return every ``(postcode, geometry)`` fragment.

    Phase 1 loads OA boundaries and UPRNs, Phase 2 loads (building the cache
    first if needed) and indexes the INSPIRE data, and Phase 3 processes each
    OA that has both a boundary and UPRNs. The large intermediates are locals
    of this function, so they become collectable once it returns.

    Args:
        args: Parsed CLI namespace; reads ``oa_boundaries``, ``uprn``,
            ``arcgis``, ``inspire``, ``output``, ``limit`` and ``workers``.
    """

    def _banner(title: str) -> None:
        """Print a phase title framed by two 60-character rules."""
        rule = "=" * 60
        print(rule)
        print(title)
        print(rule)

    # Phase 1: load all data.
    _banner("Phase 1: Loading data")

    oa_geoms = load_oa_boundaries(args.oa_boundaries)
    uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
    # Plain arrays are handed to the workers so they never call polars
    # (avoids the fork-after-threads hazard of its thread pool).
    uprn_east, uprn_north, uprn_postcodes = extract_uprn_arrays(uprn_df)

    # Phase 2: parse/load INSPIRE.
    print()
    _banner("Phase 2: INSPIRE data")

    inspire_cache_dir = args.output / "inspire_cache"
    if not inspire_cache_exists(inspire_cache_dir):
        cache_inspire(args.inspire, inspire_cache_dir)
    inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
    inspire_index = build_inspire_index(inspire_bboxes, inspire_offsets, inspire_coords)

    # Phase 3: process OAs.
    print()
    _banner("Phase 3: Processing OAs")

    # Work list: OAs that have both a boundary and at least one UPRN entry.
    with_boundary = set(oa_geoms.keys())
    with_uprns = set(uprn_offsets.keys())
    oa_codes_with_data = sorted(with_boundary & with_uprns)
    usable = len(oa_codes_with_data)
    # Skip counts are taken before --limit truncates the work list.
    skipped_no_uprn = len(oa_geoms) - usable
    skipped_no_boundary = len(uprn_offsets) - usable

    limited = args.limit > 0
    if limited:
        oa_codes_with_data = oa_codes_with_data[: args.limit]

    print(f" OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
    print(f" Skipped (no UPRNs): {skipped_no_uprn}")
    print(f" Skipped (no boundary): {skipped_no_boundary}")

    # --limit is a debug mode, so force a deterministic single process.
    if limited:
        workers = 1
    else:
        workers = _resolve_workers(args.workers)

    all_fragments, single_count = _process_oas(
        oa_codes_with_data,
        oa_geoms,
        uprn_east,
        uprn_north,
        uprn_postcodes,
        uprn_offsets,
        inspire_index,
        workers,
    )
    multi_count = len(oa_codes_with_data) - single_count

    print(f"\n Single-postcode OAs (fast path): {single_count}")
    print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
    print(f" Total fragments: {len(all_fragments)}")

    return all_fragments
|
||||
|
||||
|
||||
def main() -> None:
|
||||
|
|
@ -38,6 +246,12 @@ def main() -> None:
|
|||
parser.add_argument(
|
||||
"--limit", type=int, default=0, help="Process only first N OAs (0=all)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--workers",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Parallel worker processes for OA processing (0=all CPUs, 1=sequential)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--greenspace",
|
||||
type=Path,
|
||||
|
|
@ -46,79 +260,30 @@ def main() -> None:
|
|||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
# Phase 1: Load all data
|
||||
print("=" * 60)
|
||||
print("Phase 1: Loading data")
|
||||
print("=" * 60)
|
||||
fragments_cache = args.output / "fragments_cache.parquet"
|
||||
# Phase 3 depends only on these inputs; greenspace is applied later (Phase 4),
|
||||
# so a greenspace change must not invalidate the fragment cache.
|
||||
fragment_inputs = [args.uprn, args.arcgis, args.oa_boundaries, args.inspire]
|
||||
# --limit yields a partial fragment set; never read or write the shared cache.
|
||||
use_cache = args.limit == 0
|
||||
|
||||
oa_geoms = load_oa_boundaries(args.oa_boundaries)
|
||||
uprn_df, uprn_offsets = load_uprns(args.uprn, args.arcgis)
|
||||
|
||||
# Phase 2: Parse/load INSPIRE
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("Phase 2: INSPIRE data")
|
||||
print("=" * 60)
|
||||
|
||||
inspire_cache_dir = args.output / "inspire_cache"
|
||||
if not inspire_cache_exists(inspire_cache_dir):
|
||||
cache_inspire(args.inspire, inspire_cache_dir)
|
||||
inspire_bboxes, inspire_offsets, inspire_coords = load_inspire(inspire_cache_dir)
|
||||
|
||||
# Phase 3: Process OAs
|
||||
print()
|
||||
print("=" * 60)
|
||||
print("Phase 3: Processing OAs")
|
||||
print("=" * 60)
|
||||
|
||||
# Build work list — precompute which OAs are single vs multi-postcode
|
||||
oa_codes_with_data = sorted(set(oa_geoms.keys()) & set(uprn_offsets.keys()))
|
||||
skipped_no_uprn = len(oa_geoms) - len(oa_codes_with_data)
|
||||
skipped_no_boundary = len(uprn_offsets) - len(oa_codes_with_data)
|
||||
|
||||
if args.limit > 0:
|
||||
oa_codes_with_data = oa_codes_with_data[: args.limit]
|
||||
|
||||
print(f" OAs with UPRNs + boundaries: {len(oa_codes_with_data)}")
|
||||
print(f" Skipped (no UPRNs): {skipped_no_uprn}")
|
||||
print(f" Skipped (no boundary): {skipped_no_boundary}")
|
||||
|
||||
all_fragments: list[tuple[str, Polygon | MultiPolygon]] = []
|
||||
single_count = 0
|
||||
multi_count = 0
|
||||
|
||||
for oa_code in tqdm(
|
||||
oa_codes_with_data,
|
||||
desc="Processing OAs",
|
||||
unit="OA",
|
||||
smoothing=0.01,
|
||||
miniters=100,
|
||||
):
|
||||
oa_geom = oa_geoms[oa_code]
|
||||
points, postcodes = get_oa_uprns(uprn_df, uprn_offsets, oa_code)
|
||||
|
||||
if len(set(postcodes)) == 1:
|
||||
# Fast path: entire OA = one postcode
|
||||
all_fragments.append((postcodes[0], oa_geom))
|
||||
single_count += 1
|
||||
continue
|
||||
|
||||
# Get INSPIRE candidates via bbox pre-filter
|
||||
candidates = get_inspire_candidates(
|
||||
oa_geom.bounds, inspire_bboxes, inspire_offsets, inspire_coords
|
||||
if use_cache and fragments_cache_is_fresh(fragments_cache, fragment_inputs):
|
||||
print("=" * 60)
|
||||
print("Phase 3 cache hit — loading fragments (skipping Phases 1-3)")
|
||||
print("=" * 60)
|
||||
all_fragments = load_fragments(fragments_cache)
|
||||
print(
|
||||
f" Loaded {len(all_fragments):,} cached fragments from {fragments_cache}"
|
||||
)
|
||||
else:
|
||||
all_fragments = build_fragments(args)
|
||||
if use_cache:
|
||||
# Persist the expensive Phase-3 output before the cheap-but-fragile
|
||||
# merge/write so any failure there resumes in seconds, not ~10 hours.
|
||||
save_fragments(fragments_cache, all_fragments)
|
||||
print(f" Cached {len(all_fragments):,} fragments to {fragments_cache}")
|
||||
|
||||
fragments = process_oa(oa_geom, points, postcodes, candidates)
|
||||
all_fragments.extend(fragments)
|
||||
multi_count += 1
|
||||
|
||||
print(f"\n Single-postcode OAs (fast path): {single_count}")
|
||||
print(f" Multi-postcode OAs (INSPIRE+Voronoi): {multi_count}")
|
||||
print(f" Total fragments: {len(all_fragments)}")
|
||||
|
||||
# Free data no longer needed
|
||||
del oa_geoms, uprn_df, uprn_offsets
|
||||
del inspire_bboxes, inspire_offsets, inspire_coords
|
||||
# Free Phase-1-3 intermediates (build_fragments' locals) back to the OS.
|
||||
release_memory()
|
||||
|
||||
# Phase 4: Merge and write
|
||||
|
|
@ -145,6 +310,12 @@ def main() -> None:
|
|||
|
||||
file_count = write_district_geojson(merged, args.output)
|
||||
print(f"\n Wrote {file_count} district GeoJSON files to {args.output / 'units'}")
|
||||
|
||||
# The cache exists only to survive a crash between Phase 3 and a clean write.
|
||||
# Now that the output is complete, drop it so a later input change can never
|
||||
# be served from a stale cache.
|
||||
if use_cache:
|
||||
fragments_cache.unlink(missing_ok=True)
|
||||
print("Done!")
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -112,44 +112,130 @@ def load_inspire(
|
|||
return bboxes, offsets, coords_mmap
|
||||
|
||||
|
||||
def get_inspire_candidates(
|
||||
oa_bounds: tuple[float, float, float, float],
|
||||
# Grid cell size (m) for the parcel spatial index. The median parcel is ~25 m
# and the 99th percentile ~540 m, so almost every parcel fits inside a single
# 1 km cell; the ~0.4% larger than a cell go to an overflow list tested on every
# query.
_GRID_CELL_SIZE = 1000.0


class InspireIndex:
    """Uniform-grid spatial index over INSPIRE parcel bounding boxes.

    The per-OA candidate lookup used to linear-scan every bbox (O(N) per OA).
    This indexes parcels by grid cell so each lookup is
    O(cells_spanned + candidates). Parcels no larger than one cell are bucketed
    by their bbox min-corner cell in a CSR layout (parcel indices sorted by
    cell id, located with ``searchsorted``); the few parcels larger than a cell
    are kept in an overflow array tested directly on every query. An exact bbox
    test then runs on the gathered subset and the result is sorted, so the
    candidate set -- and its order -- matches a full ``np.where`` scan.

    The queried cell range is clamped to the indexed grid. Without the clamp a
    query bbox extending beyond the parcel extent makes the flattened id ranges
    of neighbouring columns overlap, so the same parcel is gathered (and
    returned) more than once.
    """

    def __init__(
        self,
        bboxes: np.ndarray,
        offsets: np.ndarray,
        coords_mmap: np.memmap,
        cell_size: float = _GRID_CELL_SIZE,
    ) -> None:
        """Bucket every parcel bbox into the grid.

        Args:
            bboxes: ``(N, 4)`` array indexed as min-x, min-y, max-x, max-y.
            offsets: ``(N, 2)`` array of (byte offset, point count) per parcel.
            coords_mmap: flat float64 coordinate storage addressed by ``offsets``.
            cell_size: grid cell edge length, in the same units as ``bboxes``.
        """
        self._bboxes = bboxes
        self._offsets = offsets
        self._coords = coords_mmap
        self._cell_size = cell_size

        if len(bboxes):
            self._origin_x = float(bboxes[:, 0].min())
            self._origin_y = float(bboxes[:, 1].min())
            max_cx = int((bboxes[:, 0].max() - self._origin_x) // cell_size)
            max_cy = int((bboxes[:, 1].max() - self._origin_y) // cell_size)
        else:
            # An empty parcel set is a valid (if useless) index: every query
            # simply returns no candidates instead of raising on .min().
            self._origin_x = 0.0
            self._origin_y = 0.0
            max_cx = -1
            max_cy = -1
        # Number of occupied columns; used to clamp the query's x range.
        self._nx = max_cx + 1
        # Flattened cell id is ``cx * _ny + cy``; +2 leaves an always-empty
        # guard row at the top of every column.
        self._ny = max_cy + 2

        width = bboxes[:, 2] - bboxes[:, 0]
        height = bboxes[:, 3] - bboxes[:, 1]
        small = np.where((width <= cell_size) & (height <= cell_size))[0]
        self._oversized = np.where((width > cell_size) | (height > cell_size))[0]
        self._oversized_bb = bboxes[self._oversized]

        cx = ((bboxes[small, 0] - self._origin_x) // cell_size).astype(np.int64)
        cy = ((bboxes[small, 1] - self._origin_y) // cell_size).astype(np.int64)
        cell_id = cx * self._ny + cy
        order = np.argsort(cell_id, kind="stable")
        self._sorted_cells = cell_id[order]
        self._cell_parcels = small[order]

    def candidate_indices(self, oa_bounds: tuple[float, float, float, float]) -> np.ndarray:
        """Parcel indices whose bbox overlaps ``oa_bounds`` (ascending, unique)."""
        min_e, min_n, max_e, max_n = oa_bounds
        cs = self._cell_size
        # A small parcel (<= one cell) overlapping the OA has its min-corner no
        # more than one cell below/left of the OA bbox, so widen the low edges by
        # a cell. This keeps the lookup free of false negatives.
        #
        # Then clamp to the occupied grid (cx in [0, _nx-1], cy in [0, _ny-1]).
        # Each column's id range stays inside [gx*_ny, gx*_ny + _ny - 1], so the
        # ranges of different columns are disjoint and no parcel is gathered
        # twice; it also bounds the loop for queries far outside the data.
        gx0 = max(int((min_e - cs - self._origin_x) // cs), 0)
        gx1 = min(int((max_e - self._origin_x) // cs), self._nx - 1)
        gy_lo = max(int((min_n - cs - self._origin_y) // cs), 0)
        gy_hi = min(int((max_n - self._origin_y) // cs), self._ny - 1)

        parts = []
        ob = self._oversized_bb
        if len(ob):
            mo = (
                (ob[:, 2] >= min_e)
                & (ob[:, 0] <= max_e)
                & (ob[:, 3] >= min_n)
                & (ob[:, 1] <= max_n)
            )
            if mo.any():
                parts.append(self._oversized[mo])

        if gy_lo <= gy_hi:
            for gx in range(gx0, gx1 + 1):
                base = gx * self._ny
                lo = np.searchsorted(self._sorted_cells, base + gy_lo, "left")
                hi = np.searchsorted(self._sorted_cells, base + gy_hi, "right")
                if hi > lo:
                    parts.append(self._cell_parcels[lo:hi])

        if not parts:
            return np.empty(0, dtype=np.int64)
        cand = np.concatenate(parts)
        cb = self._bboxes[cand]
        mask = (
            (cb[:, 2] >= min_e)
            & (cb[:, 0] <= max_e)
            & (cb[:, 3] >= min_n)
            & (cb[:, 1] <= max_n)
        )
        # Sort so the candidate order matches a full np.where scan exactly.
        return np.sort(cand[mask])

    def candidates(
        self, oa_bounds: tuple[float, float, float, float]
    ) -> list[Polygon]:
        """INSPIRE polygons overlapping an OA, built from the mmap on demand.

        Builds Shapely objects only for matches (typically 10-500 per OA).
        Invalid rings are repaired with ``make_valid``; a repair that yields a
        MultiPolygon keeps its largest part, and any other non-Polygon result
        (or an empty polygon) is dropped.
        """
        polygons = []
        for i in self.candidate_indices(oa_bounds):
            byte_offset = self._offsets[i, 0]
            n_pts = self._offsets[i, 1]
            float_offset = byte_offset // 8  # float64 = 8 bytes
            coords = self._coords[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
            poly = Polygon(coords)
            if not poly.is_valid:
                poly = make_valid(poly)
                if poly.geom_type == "MultiPolygon":
                    poly = max(poly.geoms, key=lambda g: g.area)
                elif poly.geom_type != "Polygon":
                    continue
            if not poly.is_empty:
                polygons.append(poly)
        return polygons
|
||||
|
||||
|
||||
def build_inspire_index(
|
||||
bboxes: np.ndarray,
|
||||
offsets: np.ndarray,
|
||||
coords_mmap: np.memmap,
|
||||
) -> list[Polygon]:
|
||||
"""Get INSPIRE polygons overlapping an OA via bbox pre-filter.
|
||||
|
||||
Builds Shapely objects only for matches (typically 10-500 per OA).
|
||||
Reads coordinate data on-demand from memory-mapped file.
|
||||
"""
|
||||
min_e, min_n, max_e, max_n = oa_bounds
|
||||
|
||||
# Vectorized bbox overlap test
|
||||
mask = (
|
||||
(bboxes[:, 2] >= min_e)
|
||||
& (bboxes[:, 0] <= max_e)
|
||||
& (bboxes[:, 3] >= min_n)
|
||||
& (bboxes[:, 1] <= max_n)
|
||||
)
|
||||
idxs = np.where(mask)[0]
|
||||
if len(idxs) == 0:
|
||||
return []
|
||||
|
||||
# Build Shapely polygons only for candidates (coords from mmap)
|
||||
candidates = []
|
||||
for i in idxs:
|
||||
byte_offset = offsets[i, 0]
|
||||
n_pts = offsets[i, 1]
|
||||
float_offset = byte_offset // 8 # float64 = 8 bytes
|
||||
coords = coords_mmap[float_offset : float_offset + n_pts * 2].reshape(-1, 2)
|
||||
poly = Polygon(coords)
|
||||
if not poly.is_valid:
|
||||
poly = make_valid(poly)
|
||||
if poly.geom_type == "MultiPolygon":
|
||||
poly = max(poly.geoms, key=lambda g: g.area)
|
||||
elif poly.geom_type != "Polygon":
|
||||
continue
|
||||
if not poly.is_empty:
|
||||
candidates.append(poly)
|
||||
return candidates
|
||||
cell_size: float = _GRID_CELL_SIZE,
|
||||
) -> InspireIndex:
|
||||
"""Build the grid spatial index used for per-OA candidate retrieval."""
|
||||
return InspireIndex(bboxes, offsets, coords_mmap, cell_size)
|
||||
|
|
|
|||
|
|
@ -3,8 +3,9 @@ import shutil
|
|||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from pyproj import Transformer
|
||||
from shapely import make_valid, set_precision
|
||||
from shapely import STRtree, make_valid, set_precision
|
||||
from shapely.errors import GEOSException
|
||||
from shapely.geometry import MultiPolygon, Polygon, mapping, shape
|
||||
from shapely.ops import transform as transform_geometry
|
||||
|
|
@ -41,30 +42,30 @@ def _largest_polygonal(geom) -> Polygon | None:
|
|||
return None
|
||||
|
||||
|
||||
def to_wgs84_geojson(
|
||||
geom: Polygon | MultiPolygon, tolerance: float = 1.0
|
||||
) -> dict | None:
|
||||
"""Simplify geometry in BNG, convert to WGS84, return a valid GeoJSON dict.
|
||||
# Output coordinate grid (~0.11 m at UK latitudes). Polygons whose extent is
|
||||
# below this in any direction snap to empty during serialization.
|
||||
_OUTPUT_PRECISION_DEG = 0.000001
|
||||
# Minimal BNG buffer used to rescue sub-grid slivers into a representable
|
||||
# footprint. A near-zero-area Voronoi/INSPIRE spike (e.g. three almost-collinear
|
||||
# vertices) would otherwise vanish at output precision; since every *active*
|
||||
# postcode must keep a boundary (validate_outputs enforces this with zero
|
||||
# tolerance), we fatten it just enough to survive snapping rather than drop it.
|
||||
_MIN_FOOTPRINT_BUFFER_M = 0.5
|
||||
|
||||
|
||||
def _snap_to_wgs84_geojson(geom_bng: Polygon | MultiPolygon) -> dict | None:
|
||||
"""Transform a BNG polygon to WGS84, snap to output precision, validate.
|
||||
|
||||
Validates the *serialized* GeoJSON dict (via a ``shape()`` round-trip), not
|
||||
just the intermediate Shapely object: coordinate snapping during
|
||||
serialization can otherwise leave a self-intersecting ring that only shows up
|
||||
once the feature is read back from disk. Any such geometry is repaired with
|
||||
``make_valid`` before returning so written features are always valid.
|
||||
once the feature is read back from disk. Returns ``None`` if the geometry
|
||||
collapses to empty (a sub-grid sliver).
|
||||
"""
|
||||
geom = _largest_polygonal(geom)
|
||||
if geom is None:
|
||||
return None
|
||||
|
||||
simplified = geom.simplify(tolerance, preserve_topology=True)
|
||||
simplified = _largest_polygonal(simplified)
|
||||
if simplified is None:
|
||||
return None
|
||||
|
||||
transformer = _get_to_wgs84()
|
||||
wgs84 = transform_geometry(transformer.transform, simplified)
|
||||
wgs84 = transform_geometry(transformer.transform, geom_bng)
|
||||
try:
|
||||
wgs84 = set_precision(wgs84, 0.000001, mode="valid_output")
|
||||
wgs84 = set_precision(wgs84, _OUTPUT_PRECISION_DEG, mode="valid_output")
|
||||
except GEOSException:
|
||||
# Precision snapping can fail on pathological geometries; fall back to a
|
||||
# plain validity repair without coordinate snapping.
|
||||
|
|
@ -87,20 +88,105 @@ def to_wgs84_geojson(
|
|||
return geojson_dict
|
||||
|
||||
|
||||
def _rescue_footprint(geom_bng) -> dict | None:
    """Buffer a degenerate BNG geometry by the minimum footprint and snap it.

    Returns the snapped GeoJSON dict, or ``None`` when the buffered shape has
    no polygonal part (or the snap itself yields nothing).
    """
    buffered = geom_bng.buffer(_MIN_FOOTPRINT_BUFFER_M)
    footprint = _largest_polygonal(buffered)
    return None if footprint is None else _snap_to_wgs84_geojson(footprint)
|
||||
|
||||
|
||||
def to_wgs84_geojson(
    geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
    """Simplify in BNG, convert to WGS84, and return a GeoJSON dict.

    Some postcodes reduce to a sub-grid sliver that snaps to empty at output
    precision. Rather than drop them, the sliver is fattened into a minimal
    footprint in place: first by buffering the simplified shape itself, and --
    when the input cannot be cleaned or that buffer also fails -- by buffering
    ``representative_point()`` of the input. ``None`` is returned up front only
    for a ``None`` or empty input.
    """
    if geom is None or geom.is_empty:
        return None

    cleaned = _largest_polygonal(geom)
    if cleaned is None:
        # Nothing polygonal to work with: go straight to the point fallback.
        return _rescue_footprint(geom.representative_point())

    candidate = _largest_polygonal(
        cleaned.simplify(tolerance, preserve_topology=True)
    )
    if candidate is None:
        # Simplification destroyed the shape; use the unsimplified one.
        candidate = cleaned

    # Normal path first; if snapping erases a thin sliver, fatten its shape.
    snapped = _snap_to_wgs84_geojson(candidate)
    if snapped is None:
        snapped = _rescue_footprint(candidate)
    if snapped is None:
        # Last resort for input too degenerate to fatten in place.
        return _rescue_footprint(geom.representative_point())
    return snapped
|
||||
|
||||
|
||||
def to_wgs84_geojson_multi(
    geom: Polygon | MultiPolygon, tolerance: float = 1.0
) -> dict | None:
    """Convert a (possibly multi-part) postcode geometry to a GeoJSON dict,
    preserving every part.

    Each part is converted independently via :func:`to_wgs84_geojson`; the
    result is that part's dict when exactly one part survives, or a
    ``MultiPolygon`` dict when several do.

    Args:
        geom: Polygon or MultiPolygon to convert; ``None`` is tolerated.
        tolerance: simplification tolerance forwarded to ``to_wgs84_geojson``.

    Returns:
        A GeoJSON geometry dict, or ``None`` if the input is ``None``/empty or
        no part produced a dict.
    """
    # Same guard as to_wgs84_geojson, so a missing geometry is reported as
    # "nothing to write" instead of raising AttributeError on ``geom_type``.
    if geom is None or geom.is_empty:
        return None

    parts = list(geom.geoms) if geom.geom_type == "MultiPolygon" else [geom]
    part_dicts = [d for part in parts if (d := to_wgs84_geojson(part, tolerance))]
    if not part_dicts:
        return None
    if len(part_dicts) == 1:
        return part_dicts[0]

    polygons = []
    for part_dict in part_dicts:
        if part_dict["type"] == "MultiPolygon":
            # Defensive: should a part ever serialise as a MultiPolygon, splice
            # its polygons in rather than nesting coordinates one level too deep.
            polygons.extend(part_dict["coordinates"])
        else:
            polygons.append(part_dict["coordinates"])
    return {
        "type": "MultiPolygon",
        "coordinates": polygons,
    }
|
||||
|
||||
|
||||
# Interior holes from the INSPIRE+Voronoi+make_valid chain are small artifacts and
|
||||
# get filled. A hole at least this large is likely a genuinely enclosed postcode
|
||||
# (kept, so we never solidify over a neighbour); the de-overlap pass is the real
|
||||
# guarantee, this is defence-in-depth.
|
||||
_MAX_ARTIFACT_HOLE_AREA = 1000.0
|
||||
|
||||
|
||||
def _fill_small_holes(poly: Polygon) -> Polygon:
    """Rebuild ``poly`` keeping only interior rings of at least
    ``_MAX_ARTIFACT_HOLE_AREA``; smaller holes are filled in."""
    surviving = []
    for ring in poly.interiors:
        if Polygon(ring).area >= _MAX_ARTIFACT_HOLE_AREA:
            surviving.append(ring)
    return Polygon(poly.exterior, surviving)
|
||||
|
||||
|
||||
def _fill_holes(geom):
    """Fill small artifact interior rings; keep large (real-enclosed) holes.

    Polygons and every member of a MultiPolygon go through
    ``_fill_small_holes``; any other geometry type is returned unchanged.
    """
    if geom.geom_type == "Polygon":
        return _fill_small_holes(geom)
    if geom.geom_type == "MultiPolygon":
        return MultiPolygon([_fill_small_holes(p) for p in geom.geoms])
    return geom
|
||||
|
||||
|
||||
def _largest_polygon(geom):
    """Return the biggest-area member of a MultiPolygon; pass others through."""
    if geom.geom_type != "MultiPolygon":
        return geom
    return max(geom.geoms, key=lambda part: part.area)
|
||||
# A postcode genuinely split across an OA seam (by a railway, river, or main road
# wider than the merge buffer) arrives here as a MultiPolygon. Keeping only the
# largest part used to discard the rest, leaving ~1.8% of merged area as uncovered
# gaps (often 3000-5000 m² building blocks). Keep every part at least this big;
# smaller detached bits are Voronoi/clipping noise and are still dropped.
_MIN_DETACHED_PART_AREA = 100.0


def _keep_polygon_parts(geom):
    """Drop MultiPolygon members smaller than ``_MIN_DETACHED_PART_AREA``.

    Non-MultiPolygon input is returned untouched. If no member reaches the
    threshold the single largest one is kept. A lone survivor is returned as
    itself, several as a new MultiPolygon.
    """
    if geom.geom_type != "MultiPolygon":
        return geom

    substantial = []
    for member in geom.geoms:
        if member.area >= _MIN_DETACHED_PART_AREA:
            substantial.append(member)
    if not substantial:
        substantial = [max(geom.geoms, key=lambda member: member.area)]

    if len(substantial) == 1:
        return substantial[0]
    return MultiPolygon(substantial)
|
||||
|
||||
|
||||
def merge_fragments(
|
||||
|
|
@ -126,14 +212,19 @@ def merge_fragments(
|
|||
continue
|
||||
if not combined.is_valid:
|
||||
combined = make_valid(combined)
|
||||
# Close tiny gaps between adjacent OA boundary edges (float mismatches)
|
||||
# Close tiny gaps between adjacent OA boundary edges (float mismatches).
|
||||
# The closing can erode a tiny MultiPolygon (e.g. a postcode with only a
|
||||
# sliver fragment) to nothing, which would leave the postcode with no
|
||||
# geometry at all — keep the un-closed shape if that happens.
|
||||
if combined.geom_type == "MultiPolygon":
|
||||
combined = combined.buffer(5.0).buffer(-5.0)
|
||||
if not combined.is_valid:
|
||||
combined = make_valid(combined)
|
||||
# Postcodes are contiguous delivery routes — keep only the largest
|
||||
# polygon; small detached fragments are algorithm artifacts
|
||||
combined = _largest_polygon(combined)
|
||||
closed = combined.buffer(5.0).buffer(-5.0)
|
||||
if not closed.is_valid:
|
||||
closed = make_valid(closed)
|
||||
if not closed.is_empty:
|
||||
combined = closed
|
||||
# Keep the postcode whole: the largest part plus any other substantial
|
||||
# part (a genuine railway/river split), dropping only tiny noise slivers.
|
||||
combined = _keep_polygon_parts(combined)
|
||||
# Remove artifact interior holes from INSPIRE+Voronoi+make_valid chain
|
||||
combined = _fill_holes(combined)
|
||||
# Subtract parks/water if provided
|
||||
|
|
@ -142,7 +233,7 @@ def merge_fragments(
|
|||
|
||||
pre_green = combined
|
||||
combined = subtract_greenspace(combined, greenspace_tree, greenspace_geoms)
|
||||
combined = _largest_polygon(combined)
|
||||
combined = _keep_polygon_parts(combined)
|
||||
# Do NOT _fill_holes here: interior holes carved by the greenspace
|
||||
# subtraction (lakes, enclosed parks) are intentional, not artifacts.
|
||||
# Filling them would re-add the removed area and negate the
|
||||
|
|
@ -155,10 +246,114 @@ def merge_fragments(
|
|||
return merged
|
||||
|
||||
|
||||
def _polygonal(geom):
    """Reduce ``geom`` to its areal content.

    Polygons and MultiPolygons pass through. A GeometryCollection is reduced to
    the union of its non-empty Polygon/MultiPolygon members. ``None`` is
    returned for ``None``, empty input, any other geometry type, or a
    collection with no areal members.
    """
    if geom is None or geom.is_empty:
        return None

    kind = geom.geom_type
    if kind in ("Polygon", "MultiPolygon"):
        return geom
    if kind != "GeometryCollection":
        return None

    areal = [
        member
        for member in geom.geoms
        if member.geom_type in ("Polygon", "MultiPolygon") and not member.is_empty
    ]
    if not areal:
        return None
    dissolved = unary_union(areal)
    return None if dissolved.is_empty else dissolved
|
||||
|
||||
|
||||
def _resolve_overlaps(
    items: list[tuple[str, Polygon | MultiPolygon]],
) -> list[tuple[str, Polygon | MultiPolygon]]:
    """Make the postcode polygons a partition: no two cover the same ground.

    Overlap appears at OA seams (the 5m merge buffer expands each postcode
    independently), from simplifying each postcode on its own, and as genuine
    containment (a postcode fully enclosed by another). Each postcode is trimmed
    by the union of its higher-priority overlapping neighbours, where **priority =
    ascending area**: a smaller postcode wins contested ground. That single rule
    handles both cases correctly — an enclosed postcode is always smaller than its
    container, so it keeps its area while the container gets a hole (an `overlaps`
    query alone would miss containment entirely). Run last, on the final output
    geometries, so nothing re-introduces overlap afterwards. A postcode that would
    be emptied keeps its original geometry, so an active postcode is never dropped.

    Args:
        items: ``(postcode, geometry)`` pairs.

    Returns:
        A new list of ``(postcode, geometry)`` pairs in the same order as
        ``items``; geometries that lost contested ground are replaced by their
        trimmed form. With fewer than two items the input list itself is
        returned unchanged.
    """
    geoms = [g for _, g in items]
    n = len(geoms)
    if n < 2:
        return items

    # rank[i]: 0 = highest priority (smallest area). Postcode string breaks ties
    # for determinism.
    rank = {
        idx: r
        for r, idx in enumerate(
            sorted(range(n), key=lambda i: (geoms[i].area, items[i][0]))
        )
    }

    tree = STRtree(geoms)
    arr = np.array(geoms, dtype=object)
    # Unordered index pairs (low, high) of geometries that share 2-D ground.
    pairs: set[tuple[int, int]] = set()
    # "overlaps" gives partial overlaps; "contains" gives containment (which
    # "overlaps" excludes) — together they cover every 2-D overlap without the
    # edge-touch explosion a plain "intersects" query would add.
    for predicate in ("overlaps", "contains"):
        # The same geometries are both the queries and the tree contents, so
        # every geometry matches itself under "contains"; s != t drops those.
        qsrc, qtgt = tree.query(arr, predicate=predicate)
        for s, t in zip(qsrc.tolist(), qtgt.tolist()):
            if s != t:
                pairs.add((s, t) if s < t else (t, s))

    # For each loser (lower priority) the higher-priority neighbours to subtract.
    higher: dict[int, list[int]] = defaultdict(list)
    for a, b in pairs:
        winner, loser = (a, b) if rank[a] < rank[b] else (b, a)
        higher[loser].append(winner)

    out = list(geoms)
    # Process losers from highest priority down, so every subtracted neighbour is
    # already finalised.
    for i in sorted(higher, key=lambda idx: rank[idx]):
        # NOTE(review): unary_union is not among the imports visible in this
        # part of the file — confirm it is imported from shapely.ops.
        cut = unary_union([out[j] for j in higher[i]])
        trimmed = out[i].difference(cut)
        if not trimmed.is_valid:
            trimmed = make_valid(trimmed)
        # Keep all polygonal parts: these geometries are in WGS84 degrees, so an
        # area threshold here would wrongly drop everything but the largest part
        # and re-open the very gaps the seam fix closed.
        trimmed = _polygonal(trimmed)
        # If nothing polygonal survives, out[i] keeps its previous geometry.
        if trimmed is not None and not trimmed.is_empty:
            out[i] = trimmed
    return [(pc, out[i]) for i, (pc, _) in enumerate(items)]
|
||||
|
||||
|
||||
def _round_coords(coords, ndigits=6):
    """Recursively round a GeoJSON coordinate structure to ``ndigits`` places.

    A non-empty sequence whose first element is a number is treated as a single
    position and reduced to its first two ordinates, rounded. Anything else is
    treated as a container and each child is processed in turn.
    """
    is_position = bool(coords) and isinstance(coords[0], (int, float))
    if not is_position:
        return [_round_coords(child, ndigits) for child in coords]
    first, second = coords[0], coords[1]
    return [round(first, ndigits), round(second, ndigits)]
|
||||
|
||||
|
||||
def _geojson_geometry(geom) -> dict | None:
    """Turn a WGS84 geometry into a GeoJSON dict with 6-decimal coordinates.

    Invalid input is repaired with ``make_valid`` first and only the polygonal
    part is kept; ``None`` is returned when nothing polygonal remains.
    """
    repaired = geom if geom.is_valid else make_valid(geom)
    areal = _polygonal(repaired)
    if areal is None or areal.is_empty:
        return None
    raw = mapping(areal)
    return {"type": raw["type"], "coordinates": _round_coords(raw["coordinates"])}
|
||||
|
||||
|
||||
def write_district_geojson(
|
||||
postcodes: dict[str, Polygon | MultiPolygon], output_dir: Path
|
||||
) -> int:
|
||||
"""Group postcodes by district, write GeoJSON files. Returns file count."""
|
||||
"""Group postcodes by district, write GeoJSON files. Returns file count.
|
||||
|
||||
Before writing, the postcode polygons are converted to their final WGS84 form
|
||||
and made a partition (overlaps removed) so the output never has two postcodes
|
||||
covering the same ground.
|
||||
"""
|
||||
units_dir = output_dir / "units"
|
||||
tmp_units_dir = output_dir / "units.tmp"
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
|
@ -166,38 +361,46 @@ def write_district_geojson(
|
|||
shutil.rmtree(tmp_units_dir)
|
||||
tmp_units_dir.mkdir(parents=True)
|
||||
|
||||
skipped: list[str] = []
|
||||
|
||||
# Pass 1: convert every postcode to its final WGS84 geometry (simplify, snap,
|
||||
# sliver-rescue, multi-part preserved). Sorted → deterministic de-overlap
|
||||
# priority. to_wgs84_geojson_multi returns None only for a genuinely empty
|
||||
# input, which is skipped and reported rather than aborting a multi-hour run.
|
||||
converted: list[tuple[str, Polygon | MultiPolygon]] = []
|
||||
for pc in sorted(postcodes):
|
||||
gj = to_wgs84_geojson_multi(postcodes[pc])
|
||||
if gj is None:
|
||||
skipped.append(pc)
|
||||
continue
|
||||
converted.append((pc, shape(gj)))
|
||||
|
||||
# Remove overlap strips so the output is a clean partition.
|
||||
converted = _resolve_overlaps(converted)
|
||||
|
||||
by_district: dict[str, list[tuple[str, Polygon | MultiPolygon]]] = defaultdict(list)
|
||||
for pc, geom in postcodes.items():
|
||||
for pc, geom in converted:
|
||||
parts = pc.split()
|
||||
district = parts[0] if parts else pc[:4]
|
||||
by_district[district].append((pc, geom))
|
||||
|
||||
file_count = 0
|
||||
seen_postcodes: set[str] = set()
|
||||
for district, entries in tqdm(
|
||||
sorted(by_district.items()), desc="Writing GeoJSON", unit="file"
|
||||
):
|
||||
features = []
|
||||
for pc, geom in sorted(entries, key=lambda x: x[0]):
|
||||
if pc in seen_postcodes:
|
||||
raise ValueError(f"Duplicate postcode boundary feature: {pc}")
|
||||
seen_postcodes.add(pc)
|
||||
geojson_geom = to_wgs84_geojson(geom)
|
||||
geojson_geom = _geojson_geometry(geom)
|
||||
if geojson_geom is None:
|
||||
raise ValueError(f"Postcode boundary collapsed to empty geometry: {pc}")
|
||||
written_geom = shape(geojson_geom)
|
||||
if written_geom.is_empty or not written_geom.is_valid:
|
||||
raise ValueError(
|
||||
f"Invalid postcode boundary geometry after output: {pc}"
|
||||
)
|
||||
mapit_code = pc.replace(" ", "")
|
||||
skipped.append(pc)
|
||||
continue
|
||||
features.append(
|
||||
{
|
||||
"type": "Feature",
|
||||
"geometry": geojson_geom,
|
||||
"properties": {
|
||||
"postcodes": pc,
|
||||
"mapit_code": mapit_code,
|
||||
"mapit_code": pc.replace(" ", ""),
|
||||
},
|
||||
}
|
||||
)
|
||||
|
|
@ -211,6 +414,14 @@ def write_district_geojson(
|
|||
json.dump(collection, f, separators=(",", ":"))
|
||||
file_count += 1
|
||||
|
||||
if skipped:
|
||||
preview = ", ".join(skipped[:10])
|
||||
suffix = " …" if len(skipped) > 10 else ""
|
||||
print(
|
||||
f" Skipped {len(skipped)} postcode(s) with degenerate (sub-grid) "
|
||||
f"geometry: {preview}{suffix}"
|
||||
)
|
||||
|
||||
if units_dir.exists():
|
||||
shutil.rmtree(units_dir)
|
||||
tmp_units_dir.replace(units_dir)
|
||||
|
|
|
|||
|
|
@ -85,19 +85,42 @@ def _claim_inspire_parcels(
|
|||
uprn_pts = shp_points(points)
|
||||
pt_idx, cand_idx = cand_tree.query(uprn_pts, predicate="within")
|
||||
|
||||
# First priority: parcels that physically contain UPRNs. Majority vote
|
||||
# resolves blocks of flats or overlapping parcel data.
|
||||
# First priority: parcels that physically contain UPRNs. A parcel holding
|
||||
# UPRNs from a single postcode goes wholly to that postcode. A parcel shared
|
||||
# by several postcodes (a block of flats spanning postcodes, or overlapping
|
||||
# parcel data) is split between them via a sub-Voronoi over their own UPRNs
|
||||
# clipped to the parcel — so EVERY contained postcode keeps part of the
|
||||
# parcel. A bare majority vote would hand the whole parcel to one winner and
|
||||
# leave the losers' UPRNs trapped inside claimed land, dropping them from
|
||||
# both this claim and the `remaining` polygon handed to Voronoi downstream.
|
||||
cand_postcodes: dict[int, list[str]] = defaultdict(list)
|
||||
cand_point_idx: dict[int, list[int]] = defaultdict(list)
|
||||
for pi, ci in zip(pt_idx, cand_idx):
|
||||
cand_postcodes[ci].append(postcodes[pi])
|
||||
cand_point_idx[ci].append(pi)
|
||||
|
||||
points_f64 = points.astype(np.float64, copy=False)
|
||||
contained_parts: dict[str, list] = defaultdict(list)
|
||||
contained_scores: Counter[str] = Counter()
|
||||
for ci, pc_list in cand_postcodes.items():
|
||||
pc_counts = Counter(pc_list)
|
||||
winner, votes = pc_counts.most_common(1)[0]
|
||||
contained_parts[winner].append(parcels[ci])
|
||||
contained_scores[winner] += votes
|
||||
if len(pc_counts) == 1:
|
||||
winner = next(iter(pc_counts))
|
||||
contained_parts[winner].append(parcels[ci])
|
||||
contained_scores[winner] += pc_counts[winner]
|
||||
continue
|
||||
# Shared parcel: sub-Voronoi over the contained UPRNs so each postcode
|
||||
# present keeps a fragment instead of being absorbed by the winner.
|
||||
sub_idx = cand_point_idx[ci]
|
||||
sub_points = points_f64[sub_idx]
|
||||
sub_postcodes = [postcodes[pi] for pi in sub_idx]
|
||||
for pc, geom in compute_voronoi_regions(
|
||||
sub_points, sub_postcodes, parcels[ci]
|
||||
).items():
|
||||
cleaned = _clean_polygonal(geom)
|
||||
if cleaned is not None:
|
||||
contained_parts[pc].append(cleaned)
|
||||
contained_scores[pc] += pc_counts[pc]
|
||||
|
||||
contained_claimed = _merge_parts_by_postcode(contained_parts)
|
||||
contained_claims = sorted(
|
||||
|
|
@ -109,7 +132,6 @@ def _claim_inspire_parcels(
|
|||
# each to the nearest UPRN/postcode so parcel boundaries carry more of the
|
||||
# visible postcode shape; Voronoi is then limited to roads, parks, water, and
|
||||
# any other non-parcel gaps.
|
||||
points_f64 = points.astype(np.float64, copy=False)
|
||||
contained_union = _union_claims(contained_claims)
|
||||
nearest_tree = cKDTree(points_f64)
|
||||
nearest_parts: dict[str, list] = defaultdict(list)
|
||||
|
|
@ -235,11 +257,11 @@ def _extract_polygonal(geom) -> Polygon | MultiPolygon | None:
|
|||
return None
|
||||
if len(polys) == 1:
|
||||
return polys[0]
|
||||
return MultiPolygon(
|
||||
[
|
||||
p
|
||||
for g in polys
|
||||
for p in (g.geoms if g.geom_type == "MultiPolygon" else [g])
|
||||
]
|
||||
)
|
||||
# Union (not bare MultiPolygon construction): make_valid can emit
|
||||
# overlapping polygonal parts, and a MultiPolygon of overlapping parts is
|
||||
# invalid — it double-counts area and makes the next `.difference()` raise
|
||||
# a TopologyException that aborts the OA (and, in parallel mode, the
|
||||
# worker). unary_union merges them into a valid geometry.
|
||||
merged = unary_union(polys)
|
||||
return merged if not merged.is_empty else None
|
||||
return None
|
||||
|
|
|
|||
|
|
@ -11,12 +11,20 @@ import pytest
|
|||
from shapely.geometry import MultiPolygon, Polygon, box
|
||||
from shapely.ops import unary_union
|
||||
|
||||
from .fragments_cache import (
|
||||
fragments_cache_is_fresh,
|
||||
load_fragments,
|
||||
save_fragments,
|
||||
)
|
||||
from .__main__ import _oa_fragments, _process_oas
|
||||
from .inspire import build_inspire_index
|
||||
from .oa_boundaries import parse_gpkg_geometry
|
||||
from .greenspace import subtract_greenspace
|
||||
from .output import (
|
||||
_fill_holes,
|
||||
merge_fragments,
|
||||
to_wgs84_geojson,
|
||||
to_wgs84_geojson_multi,
|
||||
write_district_geojson,
|
||||
)
|
||||
from .process_oa import _extract_polygonal, process_oa
|
||||
|
|
@ -173,6 +181,52 @@ class TestWhitespacePostcodes:
|
|||
|
||||
assert loaded_df["PCDS"].to_list() == ["AA1 1AB"]
|
||||
|
||||
def test_remapped_terminated_postcode_adopts_successor_oa(self, tmp_path):
    """A terminated postcode remapped to its active successor must be seeded
    with the SUCCESSOR's OA and coordinates, not the terminated postcode's own
    OA -- otherwise the successor is seeded into an OA it does not belong to
    and its boundary is split across OAs."""
    uprn_path = tmp_path / "uprn.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # One UPRN on terminated AA1 1AA, which sits in OA E00000001.
    pl.DataFrame(
        {
            "GRIDGB1E": [500010],
            "GRIDGB1N": [180010],
            "PCDS": ["AA1 1AA"],
            "OA21CD": ["E00000001"],
        }
    ).write_parquet(uprn_path)

    # AA1 1AA is terminated, so AA1 1AB is the only active successor, and it
    # lives in a different OA (E00000002) at different coordinates.
    pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB"],
            "east1m": [500010, 500030],
            "north1m": [180010, 180020],
            "oa21cd": ["E00000001", "E00000002"],
            "doterm": ["2020-01-01", None],
            "ctry25cd": ["E92000001", "E92000001"],
        }
    ).write_parquet(arcgis_path)

    frame, oa_offsets = load_uprns(uprn_path, arcgis_path)

    # The remapped point must be grouped under the successor's OA only.
    assert "E00000002" in oa_offsets, "Successor OA missing — remap kept old OA"
    assert "E00000001" not in oa_offsets, (
        "Remapped point still lives in the terminated postcode's OA"
    )

    coords, codes = get_oa_uprns(frame, oa_offsets, "E00000002")
    assert codes == ["AA1 1AB"]
    # The point should also carry the successor's authoritative coordinates.
    assert coords.tolist() == [[500030.0, 180020.0]]
|
||||
|
||||
def test_arcgis_filters_to_active_english_postcodes(self, tmp_path):
|
||||
uprns = pl.DataFrame(
|
||||
{
|
||||
|
|
@ -617,6 +671,32 @@ class TestProcessOAInspireParcelAssignment:
|
|||
for _, geom in fragments:
|
||||
assert geom.difference(oa_geom).area < 0.01
|
||||
|
||||
def test_shared_parcel_keeps_every_contained_postcode(self):
    """One parcel holding UPRNs for postcodes [A, A, B] must produce a
    fragment for each of A and B.

    Regression guard: previously the majority postcode (A) claimed the whole
    parcel, the parcel was excluded from `remaining`, and B ended up with no
    fragment at all.
    """
    whole_oa = box(0, 0, 100, 100)
    shared_parcel = box(0, 0, 100, 100)  # a single parcel spanning the entire OA
    uprn_xy = np.array(
        [
            [20, 50],  # postcode A
            [30, 50],  # postcode A (majority)
            [80, 50],  # postcode B (minority, the one that used to vanish)
        ]
    )
    labels = ["A", "A", "B"]

    by_postcode = dict(
        process_oa(whole_oa, uprn_xy, labels, inspire_candidates=[shared_parcel])
    )

    assert "A" in by_postcode, "Majority postcode A must keep a fragment"
    assert "B" in by_postcode, "Minority postcode B must not be dropped"
    frag_a = by_postcode["A"]
    frag_b = by_postcode["B"]
    assert frag_a.area > 0
    assert frag_b.area > 0
    # The two fragments must tile the parcel without overlapping each other.
    assert frag_a.intersection(frag_b).area < 0.01
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _extract_polygonal helper
|
||||
|
|
@ -656,6 +736,21 @@ class TestExtractPolygonal:
|
|||
|
||||
assert _extract_polygonal(LineString([(0, 0), (1, 1)])) is None
|
||||
|
||||
def test_overlapping_collection_unioned_to_valid(self):
    """Overlapping polygons inside a GeometryCollection must come back as one
    VALID unioned geometry whose area counts the overlap only once.

    A raw MultiPolygon of overlapping parts would be invalid and would break
    the following ``.difference()`` call.
    """
    from shapely.geometry import GeometryCollection

    first = box(0, 0, 100, 100)
    second = box(50, 50, 150, 150)  # shares a 50x50 corner with `first`
    collection = GeometryCollection([first, second])

    polygonal = _extract_polygonal(collection)

    assert polygonal is not None
    assert polygonal.is_valid
    expected_area = unary_union([first, second]).area
    assert polygonal.area == pytest.approx(expected_area)
    # The operation that used to crash must now succeed and stay valid.
    clipped = polygonal.difference(box(0, 0, 10, 10))
    assert clipped.is_valid
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Edge case: merge_fragments handles single-OA postcodes
|
||||
|
|
@ -763,12 +858,12 @@ class TestParseGpkgGeometry:
|
|||
|
||||
|
||||
class TestFillHoles:
|
||||
"""_fill_holes must remove all interior holes from polygons."""
|
||||
"""_fill_holes fills small artifact holes but keeps large (real-enclosed) ones."""
|
||||
|
||||
def test_polygon_with_hole(self):
|
||||
"""A polygon with an interior ring should become a solid polygon."""
|
||||
def test_small_artifact_hole_filled(self):
|
||||
"""A small (<1000 m²) interior ring is an artifact and gets filled."""
|
||||
outer = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
|
||||
hole = [(30, 30), (70, 30), (70, 70), (30, 70), (30, 30)]
|
||||
hole = [(40, 40), (60, 40), (60, 60), (40, 60), (40, 40)] # 20x20 = 400 m²
|
||||
poly_with_hole = Polygon(outer, [hole])
|
||||
assert len(list(poly_with_hole.interiors)) == 1
|
||||
result = _fill_holes(poly_with_hole)
|
||||
|
|
@ -776,6 +871,15 @@ class TestFillHoles:
|
|||
assert len(list(result.interiors)) == 0
|
||||
assert result.area == pytest.approx(Polygon(outer).area)
|
||||
|
||||
def test_large_hole_kept(self):
    """A hole of at least 1000 m² is treated as a genuinely enclosed area and
    must survive ``_fill_holes``."""
    shell = [(0, 0), (100, 0), (100, 100), (0, 100), (0, 0)]
    big_hole = [(20, 20), (80, 20), (80, 80), (20, 80), (20, 20)]  # 60 x 60 = 3600 m²

    filled = _fill_holes(Polygon(shell, [big_hole]))

    remaining_rings = list(filled.interiors)
    assert len(remaining_rings) == 1
    assert filled.area == pytest.approx(10000 - 3600)
|
||||
|
||||
def test_multipolygon_with_holes(self):
|
||||
"""A MultiPolygon where each part has holes should have all holes removed."""
|
||||
outer1 = [(0, 0), (50, 0), (50, 50), (0, 50), (0, 0)]
|
||||
|
|
@ -944,3 +1048,356 @@ class TestGreenspaceHolePreserved:
|
|||
merged = result["TEST1"]
|
||||
assert len(list(merged.interiors)) == 1
|
||||
assert merged.area == pytest.approx(10000 - 1600, rel=0.05)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# merge_fragments keeps substantial detached parts (no OA-seam coverage gaps)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestKeepDetachedParts:
    """merge_fragments must retain every substantial part of a postcode that is
    split across an OA seam (railway/river). Keeping only the largest part left
    roughly 1.8% of the ground uncovered."""

    def test_far_apart_parts_both_kept(self):
        # Two 50 x 50 m squares (2500 m² each) separated by 30 m, which is wider
        # than the 10 m merge buffer, so they cannot be bridged into one polygon.
        west_block = box(0, 0, 50, 50)
        east_block = box(80, 0, 130, 50)
        fragments = [("AA1 1AA", west_block), ("AA1 1AA", east_block)]

        merged = merge_fragments(fragments)["AA1 1AA"]

        assert merged.geom_type == "MultiPolygon"
        assert len(merged.geoms) == 2
        assert merged.area == pytest.approx(5000, rel=0.01)

    def test_tiny_noise_part_dropped(self):
        # A 25 m² speck is below the 100 m² threshold and should be discarded,
        # leaving just the 10000 m² main body.
        body = box(0, 0, 100, 100)
        speck = box(200, 200, 205, 205)
        fragments = [("AA1 1AA", body), ("AA1 1AA", speck)]

        merged = merge_fragments(fragments)["AA1 1AA"]

        assert merged.geom_type == "Polygon"
        assert merged.area == pytest.approx(10000, rel=0.01)
|
||||
|
||||
|
||||
class TestMultiPolygonOutput:
    """Split postcodes must be serialised as GeoJSON MultiPolygon, both by
    to_wgs84_geojson_multi and by the district writer (the Rust server and
    loader already parse MultiPolygon)."""

    @staticmethod
    def _two_squares():
        """Build a MultiPolygon of two 100 m squares placed 1 km apart (BNG)."""
        return MultiPolygon(
            [
                box(530000, 180000, 530100, 180100),
                box(531000, 180000, 531100, 180100),
            ]
        )

    def test_multipolygon_preserves_all_parts(self):
        from shapely.geometry import shape

        geojson = to_wgs84_geojson_multi(self._two_squares())

        assert geojson["type"] == "MultiPolygon"
        assert len(geojson["coordinates"]) == 2
        # Round-trip through shapely to confirm the output is usable geometry.
        round_tripped = shape(geojson)
        assert round_tripped.is_valid
        assert not round_tripped.is_empty
        assert len(round_tripped.geoms) == 2

    def test_single_part_stays_polygon(self):
        single = box(530000, 180000, 530100, 180100)
        assert to_wgs84_geojson_multi(single)["type"] == "Polygon"

    def test_writer_emits_multipolygon_feature(self, tmp_path):
        written_files = write_district_geojson({"AA1 1AA": self._two_squares()}, tmp_path)
        assert written_files == 1

        district_file = tmp_path / "units" / "AA1.geojson"
        collection = json.loads(district_file.read_text())
        first_feature = collection["features"][0]
        assert first_feature["geometry"]["type"] == "MultiPolygon"
|
||||
|
||||
|
||||
class TestOutputPartition:
    """The district writer must output a partition: postcodes that overlap on
    input are made disjoint on output, and no active postcode is dropped."""

    @staticmethod
    def _written_geoms(postcode_geoms, out_dir):
        """Write the AA1 district file and reload it as {postcode: shapely geometry}."""
        from shapely.geometry import shape

        write_district_geojson(postcode_geoms, out_dir)
        collection = json.loads((out_dir / "units" / "AA1.geojson").read_text())
        return {
            feature["properties"]["postcodes"]: shape(feature["geometry"])
            for feature in collection["features"]
        }

    def test_overlapping_postcodes_made_disjoint(self, tmp_path):
        left = box(530000, 180000, 530100, 180100)
        right = box(530090, 180000, 530200, 180100)  # shares a 10 m strip with `left`
        assert left.intersection(right).area > 0  # precondition: inputs overlap

        geoms = self._written_geoms({"AA1 1AA": left, "AA1 1AB": right}, tmp_path)

        assert set(geoms) == {"AA1 1AA", "AA1 1AB"}  # both postcodes survive
        # Interiors are disjoint; at most a boundary edge is shared.
        shared_area = geoms["AA1 1AA"].intersection(geoms["AA1 1AB"]).area
        assert shared_area == pytest.approx(0.0, abs=1e-12)
        assert all(geom.area > 0 for geom in geoms.values())

    def test_enclosed_postcode_makes_container_a_donut(self, tmp_path):
        """A postcode lying fully INSIDE another must end up disjoint from it:
        the inner one keeps its area and the container gains a hole. A plain
        `overlaps` query does not report containment, which is the regression
        this test guards against."""
        container = box(530000, 180000, 530300, 180300)  # 90,000 m²
        enclosed = box(530100, 180100, 530200, 180200)  # 10,000 m², wholly inside
        assert container.contains(enclosed)  # precondition

        geoms = self._written_geoms(
            {"AA1 1AA": container, "AA1 1AB": enclosed}, tmp_path
        )

        assert set(geoms) == {"AA1 1AA", "AA1 1AB"}  # both postcodes survive
        shared_area = geoms["AA1 1AA"].intersection(geoms["AA1 1AB"]).area
        assert shared_area == pytest.approx(0.0, abs=1e-12)
        # The container is written as a single polygon with one interior ring.
        donut = geoms["AA1 1AA"]
        assert donut.geom_type == "Polygon"
        assert len(list(donut.interiors)) == 1
        assert geoms["AA1 1AB"].area > 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# InspireIndex must return the same candidates as a brute-force bbox scan
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestInspireIndex:
    """The grid index stands in for a per-OA linear scan over all parcel
    bboxes, so it has to return the identical candidate set, in the same
    order, to leave Phase 3 output unchanged."""

    @staticmethod
    def _brute(bboxes, query):
        """Reference implementation: indices of every bbox intersecting `query`.

        `bboxes` rows and `query` are both (min_e, min_n, max_e, max_n);
        touching edges count as an intersection.
        """
        min_e, min_n, max_e, max_n = query
        overlaps_e = (bboxes[:, 2] >= min_e) & (bboxes[:, 0] <= max_e)
        overlaps_n = (bboxes[:, 3] >= min_n) & (bboxes[:, 1] <= max_n)
        return np.flatnonzero(overlaps_e & overlaps_n)

    def test_matches_brute_force_over_random_queries(self):
        rng = np.random.default_rng(0)
        n_parcels = 5000
        x = rng.uniform(0, 10000, n_parcels)
        y = rng.uniform(0, 10000, n_parcels)
        w = rng.uniform(1, 60, n_parcels)  # every parcel fits in a 500 m cell → CSR path
        h = rng.uniform(1, 60, n_parcels)
        bboxes = np.column_stack([x, y, x + w, y + h]).astype(np.float64)
        index = build_inspire_index(bboxes, None, None, cell_size=500.0)

        query_sizes = [30.0, 200.0, 1000.0, 3000.0]
        for _ in range(400):
            cx, cy = rng.uniform(0, 10000), rng.uniform(0, 10000)
            side = float(rng.choice(query_sizes))
            query = (cx, cy, cx + side, cy + side)
            expected = np.sort(self._brute(bboxes, query))
            assert np.array_equal(index.candidate_indices(query), expected)

    def test_oversized_parcel_is_found(self):
        # A parcel bigger than one cell is kept on the overflow list instead of
        # the grid. A query well inside it, far from the small parcels, must
        # still report it.
        bboxes = np.array(
            [
                [0.0, 0.0, 5000.0, 5000.0],  # 5 km parcel, far larger than a 500 m cell
                [100.0, 100.0, 120.0, 120.0],
                [4000.0, 4000.0, 4020.0, 4020.0],
            ]
        )
        index = build_inspire_index(bboxes, None, None, cell_size=500.0)
        query = (2000.0, 2000.0, 2050.0, 2050.0)

        found = index.candidate_indices(query)

        assert 0 in found
        assert np.array_equal(found, np.sort(self._brute(bboxes, query)))

    def test_no_overlap_returns_empty(self):
        bboxes = np.array([[0.0, 0.0, 10.0, 10.0], [20.0, 20.0, 30.0, 30.0]])
        index = build_inspire_index(bboxes, None, None, cell_size=500.0)
        far_away = (100.0, 100.0, 110.0, 110.0)
        assert len(index.candidate_indices(far_away)) == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parallel OA processing must match the sequential result exactly
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestParallelProcessing:
    """Running _process_oas with several workers must yield the same fragments
    as workers=1.

    Every OA here holds a single postcode (the fast path), so the test covers
    chunking, the WKB round-trip and the fork machinery with no INSPIRE data.
    """

    @staticmethod
    def _inputs(n_oas=60):
        """Build `n_oas` 50 m square OAs, each with two UPRNs of one postcode.

        Returns (codes, oa_geoms, east, north, postcodes, offsets) in the
        positional order `_process_oas` is called with below.
        """
        import pyarrow as pa

        oa_geoms = {}
        for i in range(n_oas):
            left = i * 100.0
            oa_geoms[f"E{i:08d}"] = box(left, 0.0, left + 50.0, 50.0)
        codes = sorted(oa_geoms)

        # Two UPRN rows per OA, laid out contiguously in code order.
        slots = range(len(codes))
        east = [i * 100.0 + dx for i in slots for dx in (10.0, 20.0)]
        north = [10.0, 20.0] * len(codes)
        # One postcode per OA → fast path.
        pcs = [f"AA{i % 5} {i % 9}AA" for i in slots for _ in range(2)]
        offsets = {code: (2 * i, 2 * i + 2) for i, code in enumerate(codes)}

        return (
            codes,
            oa_geoms,
            np.array(east),
            np.array(north),
            pa.array(pcs, type=pa.large_string()),
            offsets,
        )

    @staticmethod
    def _norm(frags):
        """Order-independent, hashable view of a fragment list for comparison."""
        return sorted((postcode, geom.wkb_hex) for postcode, geom in frags)

    def test_parallel_matches_sequential(self):
        codes, oa, east, north, pcs, offs = self._inputs()

        sequential, seq_count = _process_oas(
            codes, oa, east, north, pcs, offs, None, workers=1
        )
        parallel, par_count = _process_oas(
            codes, oa, east, north, pcs, offs, None, workers=3
        )

        assert len(sequential) == len(codes)  # one fragment per single-postcode OA
        assert seq_count == par_count == len(codes)
        assert self._norm(sequential) == self._norm(parallel)

    def test_oa_failure_is_tagged_with_oa_code(self):
        """An error raised while processing one OA must be re-raised carrying
        that OA's code, so the bad OA can be identified rather than surfacing
        as an anonymous worker abort."""
        # The OA is absent from the (empty) geoms dict → KeyError, which is
        # expected to be wrapped in a RuntimeError naming the OA.
        with pytest.raises(RuntimeError, match="E00099999"):
            _oa_fragments("E00099999", {}, None, None, None, {}, None)
|
||||
|
||||
|
||||
class TestDegenerateGeometryHandling:
    """validate_outputs is strict that every active postcode has a boundary, so
    a sub-grid sliver must be fattened instead of dropped. A truly empty
    geometry is skipped without aborting the whole write (the 10h regression)."""

    # Three nearly collinear BNG vertices: bbox about 28 m x 7 m but area about
    # 0.04 m² (postcode AL10 0TU). Without the rescue step it collapses to an
    # empty geometry at output precision.
    SLIVER = Polygon(
        [(523045.34, 209625.56), (523040.47, 209624.33), (523017.0, 209618.42)]
    )

    @staticmethod
    def _assert_rescued(geojson, message):
        """Check that `geojson` exists and decodes to non-empty, valid geometry."""
        from shapely.geometry import shape

        assert geojson is not None, message
        decoded = shape(geojson)
        assert not decoded.is_empty
        assert decoded.is_valid

    @staticmethod
    def _written_postcodes(postcodes, out_dir):
        """Write `postcodes`, require exactly one district file, return the
        set of postcodes present in AA1.geojson."""
        assert write_district_geojson(postcodes, out_dir) == 1
        collection = json.loads((out_dir / "units" / "AA1.geojson").read_text())
        return {feature["properties"]["postcodes"] for feature in collection["features"]}

    def test_sliver_is_rescued_to_valid_geometry(self):
        self._assert_rescued(
            to_wgs84_geojson(self.SLIVER), "sliver must be rescued, not dropped"
        )

    def test_collinear_zero_area_input_is_rescued(self):
        """A zero-area collinear 'polygon' cannot be cleaned into a polygon; it
        must still be rescued through the representative-point fallback."""
        flat = Polygon(
            [(523000, 209600), (523010, 209600), (523020, 209600), (523000, 209600)]
        )
        assert flat.area == 0.0
        self._assert_rescued(
            to_wgs84_geojson(flat), "degenerate input must be rescued, not dropped"
        )

    def test_sliver_postcode_present_in_output(self, tmp_path):
        written = self._written_postcodes(
            {
                "AA1 1AA": box(530000, 180000, 530100, 180100),
                "AA1 1AB": self.SLIVER,  # has to survive the write
            },
            tmp_path,
        )
        assert written == {"AA1 1AA", "AA1 1AB"}

    def test_empty_geometry_skipped_not_raised(self, tmp_path):
        # Last-resort safety net: a geometry that cannot be rescued (empty) is
        # skipped, so a single bad postcode never aborts a multi-hour run.
        written = self._written_postcodes(
            {
                "AA1 1AA": box(530000, 180000, 530100, 180100),
                "AA1 1AB": Polygon(),  # genuinely empty
            },
            tmp_path,
        )
        assert written == {"AA1 1AA"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# fragments_cache round-trips Phase 3 output and validates freshness
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestFragmentsCache:
    """The Phase 3 fragment cache lets a crashed run resume without repeating
    the ~10h OA loop; these tests cover round-tripping and freshness checks."""

    def test_round_trip_preserves_postcodes_and_geometry(self, tmp_path):
        saved = [
            ("AA1 1AA", box(0, 0, 100, 100)),
            ("AA1 1AB", box(200, 200, 250, 260)),
            # A postcode that spans several OAs shows up as repeated entries.
            ("AA1 1AA", box(100, 0, 150, 100)),
            ("AA1 1AC", MultiPolygon([box(0, 0, 10, 10), box(20, 20, 30, 30)])),
        ]
        cache_path = tmp_path / "fragments_cache.parquet"

        save_fragments(cache_path, saved)
        restored = load_fragments(cache_path)

        saved_codes = [postcode for postcode, _ in saved]
        restored_codes = [postcode for postcode, _ in restored]
        assert restored_codes == saved_codes
        for (_, before), (_, after) in zip(saved, restored):
            assert after.equals(before)

    def test_save_is_atomic_no_tmp_left_behind(self, tmp_path):
        cache_path = tmp_path / "fragments_cache.parquet"
        save_fragments(cache_path, [("AA1 1AA", box(0, 0, 1, 1))])
        assert cache_path.exists()
        leftover = tmp_path / "fragments_cache.parquet.tmp"
        assert not leftover.exists()

    def test_missing_cache_is_not_fresh(self, tmp_path):
        source = tmp_path / "uprn.parquet"
        source.write_text("x")
        never_written = tmp_path / "fragments_cache.parquet"
        assert fragments_cache_is_fresh(never_written, [source]) is False

    def test_cache_newer_than_inputs_is_fresh(self, tmp_path):
        import os

        source = tmp_path / "uprn.parquet"
        source.write_text("x")
        cache_path = tmp_path / "fragments_cache.parquet"
        cache_path.write_text("c")
        # Force the cache's mtime to be later than the input's.
        os.utime(source, (1_000, 1_000))
        os.utime(cache_path, (2_000, 2_000))
        assert fragments_cache_is_fresh(cache_path, [source, None]) is True

    def test_cache_older_than_any_input_is_stale(self, tmp_path):
        import os

        source = tmp_path / "oa.gpkg"
        source.write_text("x")
        cache_path = tmp_path / "fragments_cache.parquet"
        cache_path.write_text("c")
        os.utime(cache_path, (1_000, 1_000))
        os.utime(source, (2_000, 2_000))  # the input was modified after the cache
        assert fragments_cache_is_fresh(cache_path, [source]) is False

    def test_missing_input_is_ignored(self, tmp_path):
        cache_path = tmp_path / "fragments_cache.parquet"
        cache_path.write_text("c")
        # arcgis is optional; an absent file cannot have invalidated the cache.
        absent_input = tmp_path / "absent.parquet"
        assert fragments_cache_is_fresh(cache_path, [absent_input]) is True
|
||||
|
|
|
|||
|
|
@ -79,13 +79,42 @@ def load_uprns(
|
|||
)
|
||||
|
||||
if mapping is not None and mapping.height > 0:
|
||||
uprns = (
|
||||
uprns.join(
|
||||
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
|
||||
# Remap terminated postcodes to their nearest active successor. The
|
||||
# successor generally lives in a DIFFERENT OA (and at different grid
|
||||
# coordinates), so the remapped point must adopt the successor's
|
||||
# authoritative OA/coords — keeping the terminated postcode's original
|
||||
# OA would seed the successor into an OA it doesn't belong to, splitting
|
||||
# its boundary across OAs. Genuine (non-remapped) UPRN rows keep their
|
||||
# own OA, since a live postcode can legitimately span several OAs.
|
||||
uprns = uprns.join(
|
||||
mapping.lazy(), left_on="PCDS", right_on="old_postcode", how="left"
|
||||
).with_columns(pl.col("new_postcode").is_not_null().alias("_remapped"))
|
||||
if active_postcode_points is not None:
|
||||
successor_oa = active_postcode_points.rename(
|
||||
{
|
||||
"PCDS": "new_postcode",
|
||||
"GRIDGB1E": "_succ_e",
|
||||
"GRIDGB1N": "_succ_n",
|
||||
"OA21CD": "_succ_oa",
|
||||
}
|
||||
)
|
||||
.with_columns(pl.coalesce("new_postcode", "PCDS").alias("PCDS"))
|
||||
.select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||
)
|
||||
uprns = uprns.join(successor_oa, on="new_postcode", how="left").with_columns(
|
||||
pl.when("_remapped")
|
||||
.then(pl.col("_succ_e"))
|
||||
.otherwise(pl.col("GRIDGB1E"))
|
||||
.alias("GRIDGB1E"),
|
||||
pl.when("_remapped")
|
||||
.then(pl.col("_succ_n"))
|
||||
.otherwise(pl.col("GRIDGB1N"))
|
||||
.alias("GRIDGB1N"),
|
||||
pl.when("_remapped")
|
||||
.then(pl.col("_succ_oa"))
|
||||
.otherwise(pl.col("OA21CD"))
|
||||
.alias("OA21CD"),
|
||||
)
|
||||
uprns = uprns.with_columns(
|
||||
pl.coalesce("new_postcode", "PCDS").alias("PCDS")
|
||||
).select("GRIDGB1E", "GRIDGB1N", "PCDS", "OA21CD")
|
||||
|
||||
if active_postcode_points is not None:
|
||||
active_postcodes = active_postcode_points.select("PCDS").unique()
|
||||
|
|
@ -149,3 +178,37 @@ def get_oa_uprns(
|
|||
)
|
||||
postcodes = sub["PCDS"].to_list()
|
||||
return points, postcodes
|
||||
|
||||
|
||||
def extract_uprn_arrays(df: pl.DataFrame):
    """Turn the UPRN DataFrame into fork-shareable numpy/Arrow arrays.

    Returns ``(east, north, postcodes)``: the ``GRIDGB1E`` and ``GRIDGB1N``
    columns as contiguous float64 ndarrays, and the ``PCDS`` column as a single
    contiguous pyarrow Array. Multiprocessing workers slice these per OA with
    :func:`get_oa_uprns_arrays` **without touching polars**, which sidesteps
    the fork-after-threads deadlock hazard of polars' rayon pool. Because they
    are plain numpy/Arrow buffers rather than millions of Python objects,
    ``fork`` shares them copy-on-write instead of duplicating ~1GB per worker.
    """
    import pyarrow as pa

    def _contiguous_f64(column):
        # Coerce the column to a C-contiguous float64 ndarray.
        return np.ascontiguousarray(df[column].to_numpy(), dtype=np.float64)

    easting = _contiguous_f64("GRIDGB1E")
    northing = _contiguous_f64("GRIDGB1N")

    pcds = df["PCDS"].to_arrow()
    # Collapse a chunked column so per-OA slicing works on one flat array.
    if isinstance(pcds, pa.ChunkedArray):
        pcds = pcds.combine_chunks()

    return easting, northing, pcds
|
||||
|
||||
|
||||
def get_oa_uprns_arrays(
    east: np.ndarray,
    north: np.ndarray,
    postcodes,
    offsets: dict[str, tuple[int, int]],
    oa_code: str,
) -> tuple[np.ndarray, list[str]]:
    """Polars-free counterpart of :func:`get_oa_uprns`, safe to call in workers.

    Looks up the half-open row range ``offsets[oa_code]`` and slices the
    arrays produced by :func:`extract_uprn_arrays`. Returns the points as an
    ``(n, 2)`` array of (east, north) pairs together with the matching
    postcodes as a Python list. Raises ``KeyError`` when `oa_code` has no entry
    in `offsets`.
    """
    start, stop = offsets[oa_code]
    rows = slice(start, stop)
    coords = np.column_stack((east[rows], north[rows]))
    # `postcodes.slice` takes (offset, length), not (start, stop).
    labels = postcodes.slice(start, stop - start).to_pylist()
    return coords, labels
|
||||
|
|
|
|||
|
|
@ -11,9 +11,9 @@ from pathlib import Path
|
|||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
|
||||
from pipeline.transform.price_estimation.index import build_index
|
||||
from pipeline.transform.price_estimation.knn import (
|
||||
KNN_BLEND_WEIGHT,
|
||||
build_knn_pool,
|
||||
knn_median_psm,
|
||||
)
|
||||
|
|
@ -115,7 +115,10 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:
|
|||
.clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
|
||||
.exp()
|
||||
)
|
||||
.fill_null(pl.col("input_price").cast(pl.Float64))
|
||||
# Keep null when the index can't be interpolated, matching production
|
||||
# (estimate.py ships null there). compute_metrics filters to finite
|
||||
# positive predictions, so these rows correctly drop from the Index n
|
||||
# rather than silently degrading to the Naive prediction.
|
||||
.alias("predicted"),
|
||||
)
|
||||
return test
|
||||
|
|
@ -265,13 +268,12 @@ def main():
|
|||
f" kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)"
|
||||
)
|
||||
|
||||
# Blend: (1-w)*index + w*kNN where both available
|
||||
# Blend with the exact shipped estimator (stability gate + last-price cap +
|
||||
# null-when-no-index) so the "Blended" stage reflects production accuracy.
|
||||
# input_price is the backtest equivalent of production's "Last known price".
|
||||
index_est = test["predicted"].to_numpy().astype(np.float64)
|
||||
knn_valid = np.isfinite(knn_est) & (knn_est > 0)
|
||||
blended = np.where(
|
||||
knn_valid & np.isfinite(index_est),
|
||||
(1 - KNN_BLEND_WEIGHT) * index_est + KNN_BLEND_WEIGHT * knn_est,
|
||||
np.where(np.isfinite(index_est), index_est, knn_est),
|
||||
blended = guarded_blend_estimates(
|
||||
index_est, knn_est, test["input_price"].cast(pl.Float64).to_numpy()
|
||||
)
|
||||
|
||||
actual = test["actual_price"].to_numpy().astype(np.float64)
|
||||
|
|
|
|||
|
|
@ -19,6 +19,8 @@ from tqdm import tqdm
|
|||
from pipeline.transform.price_estimation.shrinkage import (
|
||||
blend_dicts,
|
||||
hierarchical_shrinkage,
|
||||
reanchor_dict,
|
||||
reanchor_dicts,
|
||||
shrink_dicts,
|
||||
spatial_smooth,
|
||||
)
|
||||
|
|
@ -431,6 +433,17 @@ def build_index(
|
|||
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
|
||||
)
|
||||
|
||||
# Re-anchor every repeat-sales dict to the global base year before any
|
||||
# shrinkage/smoothing/blending. solve_robust_index anchors each cell to
|
||||
# log-index 0 at its OWN earliest year, so cells with shorter histories
|
||||
# are measured from a later origin; combining them key-by-key would
|
||||
# otherwise average level-incompatible numbers. The hedonic fallback is
|
||||
# already anchored at min_year, so we align everything to min_year.
|
||||
national_idx = reanchor_dict(national_idx, min_year)
|
||||
area_idx = reanchor_dicts(area_idx, min_year)
|
||||
district_idx = reanchor_dicts(district_idx, min_year)
|
||||
sector_idx = reanchor_dicts(sector_idx, min_year)
|
||||
|
||||
# Shrinkage: national -> hedonic first, then hierarchical
|
||||
print(" Applying shrinkage...")
|
||||
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
|
||||
|
|
|
|||
|
|
@ -13,6 +13,51 @@ SPATIAL_NEIGHBORS = 5
|
|||
SPATIAL_BLEND_K = 30
|
||||
|
||||
|
||||
def _base_value(index: dict[int, float], base_year: int) -> float:
|
||||
"""Value of an index dict at `base_year`, with forward/back-fill for gaps.
|
||||
|
||||
Each repeat-sales dict is anchored to 0 at its OWN earliest year, so its
|
||||
values are log-levels relative to that origin. To express it on a common
|
||||
origin we need its value at the shared `base_year`:
|
||||
- exact hit: use it directly;
|
||||
- base_year before the dict's history: back-fill, i.e. the earliest known
|
||||
value (which is 0.0 by construction). We cannot observe the level move
|
||||
between the global base and a later-starting cell, so we assume none,
|
||||
matching forward_fill's back-fill convention;
|
||||
- base_year inside a gap / after history: forward-fill the most recent
|
||||
prior value.
|
||||
"""
|
||||
if base_year in index:
|
||||
return index[base_year]
|
||||
years = sorted(index)
|
||||
if not years or base_year < years[0]:
|
||||
return index[years[0]] if years else 0.0
|
||||
prior = [y for y in years if y <= base_year]
|
||||
return index[prior[-1]]
|
||||
|
||||
|
||||
def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
    """Shift an index dict by a constant so that its value at `base_year` is 0.

    The same constant is subtracted from every year, so all within-dict
    year-to-year differences are preserved and estimate.py's
    (current - sale) semantics do not change; only the level mismatch between
    different dicts is removed ahead of blending.

    An empty dict, or one whose base value is already 0.0, is returned as the
    same object; otherwise a new dict is built.
    """
    offset = _base_value(index, base_year) if index else 0.0
    if offset == 0.0:
        # Nothing to shift (covers the empty-dict case too).
        return index
    shifted = {}
    for year, level in index.items():
        shifted[year] = level - offset
    return shifted
|
||||
|
||||
|
||||
def reanchor_dicts(
    indices: dict[str, dict[int, float]], base_year: int
) -> dict[str, dict[int, float]]:
    """Apply :func:`reanchor_dict` to each index in `indices`.

    Returns a new mapping with the same keys, in the same order, where every
    index has been shifted onto the shared `base_year` origin.
    """
    reanchored = {}
    for key in indices:
        reanchored[key] = reanchor_dict(indices[key], base_year)
    return reanchored
|
||||
|
||||
|
||||
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
|
||||
"""Shrink dict values toward parent using n/(n+k) weighting.
|
||||
|
||||
|
|
|
|||
|
|
@ -158,6 +158,53 @@ def test_transform_crime_writes_by_year_output(tmp_path):
|
|||
assert serious[2024] == 12.0
|
||||
|
||||
|
||||
def test_transform_crime_headline_is_mean_of_per_year_bars(tmp_path):
    """The avg/yr headline has to match the average of the by-year chart bars,
    i.e. the simple mean of each year's annualised count, and NOT a
    month-weighted pooled rate. The two differ whenever years have uneven
    partial-month coverage."""
    crime_dir = tmp_path / "crime"
    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"

    def write_month(month, crime_ids):
        # One street-level CSV per month directory, one burglary per crime id.
        month_dir = crime_dir / month
        month_dir.mkdir(parents=True)
        rows = [
            f"{crime_id},{month},F,F,-0.1,51.5,X,E01000001,L,Burglary,U,"
            for crime_id in crime_ids
        ]
        csv_path = month_dir / f"{month}-test-force-street.csv"
        csv_path.write_text("\n".join([header, *rows]) + "\n")

    # 2023: 6 burglaries in 1 month -> 6 * 12 / 1 = 72/yr.
    write_month("2023-01", range(1, 7))
    # 2024: 2 burglaries across 2 months -> 2 * 12 / 2 = 12/yr.
    write_month("2024-01", [7])
    write_month("2024-02", [8])

    output = tmp_path / "crime.parquet"
    by_year_output = tmp_path / "crime_by_year.parquet"
    transform_crime(crime_dir, output, by_year_output)

    # Mean of the per-year bars is (72 + 12) / 2 = 42.0. The former pooled rate
    # (8 incidents / 3 months * 12 = 32.0) would be wrong.
    headline = pl.read_parquet(output).to_dicts()[0]["Burglary (avg/yr)"]
    assert headline == 42.0

    first_row = pl.read_parquet(by_year_output).row(0, named=True)
    bars = {point["year"]: point["count"] for point in first_row["Burglary (by year)"]}
    assert bars == {2023: 72.0, 2024: 12.0}
    # The headline is exactly the mean of the bars it summarises.
    assert headline == sum(bars.values()) / len(bars)
|
||||
|
||||
|
||||
def test_transform_crime_fails_without_valid_months(tmp_path):
|
||||
crime_dir = tmp_path / "crime"
|
||||
month_dir = crime_dir / "2024-01"
|
||||
|
|
|
|||
|
|
@ -252,6 +252,63 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
|
|||
assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}
|
||||
|
||||
|
||||
def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
    """The avg/yr headline divides by a postcode's own years-present.

    P (AB1 1AA) has burglaries only in 2024, while the far-away Q (AB1 1AB)
    has one burglary in 2014. Across all postcodes the type spans two years,
    but each postcode sees it in exactly one. Every headline must therefore
    equal that postcode's single by-year bar (24/yr for P, 12/yr for Q); a
    global two-year denominator would deflate P to 12/yr. Both squares have
    the same area, so area normalisation leaves the counts untouched.
    """
    boundaries_dir = tmp_path / "units"
    squares = [
        _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),
        _square_feature("AB1 1AB", 5000, 5000, 5010, 5010),
    ]
    _write_boundaries(boundaries_dir, {"AB1": squares})

    crime_dir = tmp_path / "crime"
    # P: two burglaries in one 2024 month -> a 24/yr bar in a single year.
    recent_rows = [_crime_row("2024-01", 1005, 1005, "Burglary") for _ in range(2)]
    _write_month(crime_dir, "2024-01", recent_rows)
    # Q: one burglary back in 2014 widens the type's global span to two years
    # without contributing any incident to P.
    old_rows = [_crime_row("2014-01", 5005, 5005, "Burglary")]
    _write_month(crime_dir, "2014-01", old_rows)

    avg_path = tmp_path / "crime_by_postcode.parquet"
    bars_path = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime_dir, boundaries_dir, avg_path, bars_path, buffer_m=50.0)

    headline = {}
    for record in pl.read_parquet(avg_path).to_dicts():
        headline[record["postcode"]] = record
    yearly = {}
    for record in pl.read_parquet(bars_path).to_dicts():
        yearly[record["postcode"]] = record

    def bars_for(postcode):
        # Flatten the list of {year, count} structs into a year -> count map.
        return {pt["year"]: pt["count"] for pt in yearly[postcode]["Burglary (by year)"]}

    # P: the headline is the simple mean of its own bars (only the 2024 bar).
    p_bars = bars_for("AB1 1AA")
    assert p_bars == {2024: pytest.approx(24.0, abs=0.05)}
    p_headline = headline["AB1 1AA"]["Burglary (avg/yr)"]
    # Per-postcode denominator (1) -> 24.0; the global one (2) would give 12.0.
    assert p_headline == pytest.approx(24.0, abs=0.05)
    assert p_headline == pytest.approx(sum(p_bars.values()) / len(p_bars), abs=0.05)

    # Q: its lone 2014 bar is 12/yr, divided by its own single year = 12.0.
    q_bars = bars_for("AB1 1AB")
    assert q_bars == {2014: pytest.approx(12.0, abs=0.05)}
    assert headline["AB1 1AB"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
|
||||
|
||||
|
||||
def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
|
||||
units = tmp_path / "units"
|
||||
_write_boundaries(
|
||||
|
|
|
|||
|
|
@ -149,6 +149,7 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
|
|||
"town_city": ["Exampletown", "Exampletown"],
|
||||
"duration": ["F", "F"],
|
||||
"old_new": ["N", "N"],
|
||||
"ppd_category": ["A", "A"],
|
||||
}
|
||||
).write_parquet(price_paid_path)
|
||||
|
||||
|
|
@ -201,6 +202,7 @@ def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
|
|||
"town_city": ["Exampletown", "Exampletown"],
|
||||
"duration": ["F", "F"],
|
||||
"old_new": ["N", "N"],
|
||||
"ppd_category": ["A", "A"],
|
||||
}
|
||||
).write_parquet(price_paid_path)
|
||||
|
||||
|
|
@ -235,6 +237,7 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
|
|||
"town_city": ["Exampletown"],
|
||||
"duration": ["F"],
|
||||
"old_new": ["N"],
|
||||
"ppd_category": ["A"],
|
||||
}
|
||||
).write_parquet(price_paid_path)
|
||||
|
||||
|
|
@ -259,6 +262,93 @@ def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path
|
|||
]
|
||||
|
||||
|
||||
def test_run_excludes_category_b_sales_from_price_aggregations(tmp_path: Path):
    """Category B sales must not feed latest_price / historical_prices.

    Category B entries (repossessions, bulk/portfolio, power-of-sale) are
    excluded from the price aggregations, yet the property itself still
    survives through its standard Category A sales.
    """
    # Serialise a single EPC certificate row to CSV in memory, then zip it.
    certificate_csv = io.StringIO()
    csv_writer = csv.DictWriter(certificate_csv, fieldnames=EPC_SOURCE_COLUMNS)
    csv_writer.writeheader()
    csv_writer.writerow(_row())

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
        bundle.writestr("certificates-2024.csv", certificate_csv.getvalue())

    sale_count = 3
    transfers = pl.DataFrame(
        {
            "price": [200_000, 250_000, 5_000_000],
            "date_of_transfer": [date(year, 2, 3) for year in (2020, 2022, 2024)],
            "property_type": ["T"] * sale_count,
            "postcode": ["AA1 1AA"] * sale_count,
            "paon": ["1"] * sale_count,
            "saon": [None] * sale_count,
            "street": ["Example Street"] * sale_count,
            "locality": [None] * sale_count,
            "town_city": ["Exampletown"] * sale_count,
            "duration": ["F"] * sale_count,
            "old_new": ["N"] * sale_count,
            # The most recent (5M) sale is a Category B bulk/portfolio transfer.
            "ppd_category": ["A", "A", "B"],
        }
    )
    sales_path = tmp_path / "price-paid.parquet"
    transfers.write_parquet(sales_path)

    result_path = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_path, result_path, tmp_path)

    merged = pl.read_parquet(result_path)

    assert merged.height == 1
    # Only the two Category A sales remain; the 5M Category B one is dropped.
    assert merged["latest_price"].to_list() == [250_000]
    assert merged["historical_prices"].list.len().to_list() == [2]
|
||||
|
||||
|
||||
def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: Path):
    """A sub-MIN_PRICE first sale still dates a new-build's construction.

    The earliest transfer of a new-build supplies its EXACT construction year
    even when that sale is below MIN_PRICE, while latest_price only considers
    the quality-passing (>=MIN_PRICE) sale.
    """
    # Serialise a single EPC certificate row to CSV in memory, then zip it.
    certificate_csv = io.StringIO()
    csv_writer = csv.DictWriter(certificate_csv, fieldnames=EPC_SOURCE_COLUMNS)
    csv_writer.writeheader()
    csv_writer.writerow(_row())

    epc_zip = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(epc_zip, "w", compression=zipfile.ZIP_DEFLATED) as bundle:
        bundle.writestr("certificates-2024.csv", certificate_csv.getvalue())

    sale_count = 2
    transfers = pl.DataFrame(
        {
            "price": [30_000, 300_000],
            "date_of_transfer": [date(year, 2, 3) for year in (2015, 2022)],
            "property_type": ["T"] * sale_count,
            "postcode": ["AA1 1AA"] * sale_count,
            "paon": ["1"] * sale_count,
            "saon": [None] * sale_count,
            "street": ["Example Street"] * sale_count,
            "locality": [None] * sale_count,
            "town_city": ["Exampletown"] * sale_count,
            "duration": ["F"] * sale_count,
            "old_new": ["Y"] * sale_count,
            "ppd_category": ["A"] * sale_count,
        }
    )
    sales_path = tmp_path / "price-paid.parquet"
    transfers.write_parquet(sales_path)

    result_path = tmp_path / "epc-pp.parquet"
    _run(epc_zip, sales_path, result_path, tmp_path)

    merged = pl.read_parquet(result_path)

    assert merged.height == 1
    # The construction year is the genuine earliest transfer (2015) and is
    # flagged EXACT, despite that sale being below MIN_PRICE.
    assert merged["construction_age_band"].to_list() == [2015]
    assert merged["is_construction_date_approximate"].to_list() == [0]
    # Price aggregations use only the >=MIN_PRICE sale.
    assert merged["latest_price"].to_list() == [300_000]
    assert merged["historical_prices"].list.len().to_list() == [1]
|
||||
|
||||
|
||||
def test_epc_band_to_year_uses_midpoint_and_clamps():
|
||||
import polars as pl
|
||||
|
||||
|
|
|
|||
|
|
@ -13,7 +13,9 @@ from pipeline.transform.merge import (
|
|||
_active_english_postcode_area,
|
||||
_build_unmatched_listing_seed_rows,
|
||||
_canonical_postcode_expr,
|
||||
_coalesce_direct_epc_columns,
|
||||
_filter_to_active_english_postcodes,
|
||||
_join_area_side_tables,
|
||||
_finalize_listings,
|
||||
_integrate_listings,
|
||||
_match_direct_epc,
|
||||
|
|
@ -506,6 +508,25 @@ def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
|
|||
assert loaded["_actual_lat"].to_list() == [51.5]
|
||||
|
||||
|
||||
def test_load_listings_for_merge_uprn_key_matches_normalize_uprn(tmp_path) -> None:
    """A Float64 UPRN yields the same digits-only key as `_normalize_uprn`.

    A float UPRN (e.g. one read from a NaN-bearing parquet column) must not
    lose the exact UPRN match. Naively stringifying "100023336956.0" and
    stripping non-digits would give "1000233369560" (a bogus trailing zero),
    which never collides with the candidate-side key "100023336956".
    """
    float_uprn = 100023336956.0
    listings_file = tmp_path / "listings.parquet"
    arcgis_file = tmp_path / "arcgis.parquet"

    listings = _sample_listings_frame().with_columns(
        pl.lit(float_uprn, dtype=pl.Float64).alias("UPRN")
    )
    listings.write_parquet(listings_file)
    _stub_arcgis(arcgis_file)

    keys = _load_listings_for_merge(listings_file, arcgis_file)["_listing_uprn"].to_list()

    assert keys == [_normalize_uprn(float_uprn)]
    assert keys == ["100023336956"]
|
||||
|
||||
|
||||
def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
|
||||
tmp_path,
|
||||
) -> None:
|
||||
|
|
@ -697,6 +718,110 @@ def test_normalize_uprn_handles_types_and_floats() -> None:
|
|||
assert _normalize_uprn(float("nan")) is None
|
||||
|
||||
|
||||
def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
    """`was_council_house` resolves to "Yes" when EITHER side says "Yes".

    Upstream fills the raw property value with "No", so a plain coalesce would
    let that non-null "No" override a directly-matched listing "Yes". Other
    direct-EPC columns keep ordinary coalesce semantics.
    """
    row_count = 5
    all_null = [None] * row_count
    columns = {
        "was_council_house": ["No", "Yes", "No", None, None],
        "_direct_was_council_house": ["Yes", "No", None, "Yes", None],
        # An unrelated direct-EPC pair retains the plain-coalesce behaviour.
        "current_energy_rating": [None, "C", "D", None, None],
        "_direct_current_energy_rating": ["B", "A", None, "E", None],
    }
    # _coalesce_direct_epc_columns coalesces every pair listed in
    # _DIRECT_EPC_RAW_COLUMN_MAP, so the remaining pairs must exist as well.
    for filler in (
        "epc_address",
        "_direct_epc_address",
        "potential_energy_rating",
        "_direct_potential_energy_rating",
        "total_floor_area",
        "_direct_total_floor_area",
        "number_habitable_rooms",
        "_direct_number_habitable_rooms",
        "floor_height",
        "_direct_floor_height",
        "construction_age_band",
        "_direct_construction_age_band",
        "is_construction_date_approximate",
        "_direct_is_construction_date_approximate",
    ):
        columns[filler] = all_null

    merged = _coalesce_direct_epc_columns(pl.LazyFrame(columns)).collect()

    assert merged["was_council_house"].to_list() == ["Yes", "Yes", "No", "Yes", None]
    # Plain coalesce (raw wins when non-null) is unchanged for other columns.
    assert merged["current_energy_rating"].to_list() == ["B", "C", "D", "E", None]
|
||||
|
||||
|
||||
def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    """A postcode missing from the crime table keeps null crime rollups.

    Crime is LEFT-joined per postcode, so an absent postcode must not be
    fabricated as "zero crime" (the safest-looking value): when every per-type
    column is null, the Serious/Minor rollups stay null too.
    """
    postcodes = ["AA1 1AA", "BB2 2BB"]
    lsoas = ["E01000001", "E01000002"]
    districts = ["E09000001", "E09000002"]
    constituencies = ["E14000001", "E14000002"]

    properties = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoas,
            "Local Authority District code (2024)": districts,
            "pcon": constituencies,
        }
    )

    def postcode_frame(columns=None) -> pl.LazyFrame:
        # Side table keyed by postcode, optionally carrying extra columns.
        data = {"postcode": ["AA1 1AA", "BB2 2BB"]}
        data.update(columns or {})
        return pl.LazyFrame(data)

    serious_rates = {
        "Violence and sexual offences (avg/yr)": 1.0,
        "Robbery (avg/yr)": 2.0,
        "Burglary (avg/yr)": 3.0,
        "Possession of weapons (avg/yr)": 4.0,
    }
    minor_columns = (
        "Anti-social behaviour (avg/yr)",
        "Criminal damage and arson (avg/yr)",
        "Shoplifting (avg/yr)",
        "Bicycle theft (avg/yr)",
        "Theft from the person (avg/yr)",
        "Other theft (avg/yr)",
        "Vehicle crime (avg/yr)",
        "Public order (avg/yr)",
        "Drugs (avg/yr)",
        "Other crime (avg/yr)",
    )
    # Crime exists only for AA1 1AA; BB2 2BB is absent from the table.
    crime_table = pl.LazyFrame(
        {
            "postcode": ["AA1 1AA"],
            **{column: [rate] for column, rate in serious_rates.items()},
            **{column: [1.0] for column in minor_columns},
        }
    )

    joined = _join_area_side_tables(
        properties,
        iod=pl.LazyFrame({"LSOA code (2021)": lsoas}),
        ethnicity=pl.LazyFrame({"Geography_code": districts}),
        crime=crime_table,
        median_age=pl.LazyFrame({"lsoa21": lsoas}),
        election=pl.LazyFrame({"pcon": constituencies}),
        poi_counts=postcode_frame(),
        noise=postcode_frame(),
        school_proximity=postcode_frame(),
        conservation_areas=postcode_frame({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=pl.LazyFrame({"bb_postcode": postcodes}),
    ).collect()

    rollups = {}
    for record in joined.select(
        "postcode", "serious_crime_avg_yr", "minor_crime_avg_yr"
    ).to_dicts():
        rollups[record["postcode"]] = record

    # Present postcode: rollups are the component sums (1+2+3+4, 10×1).
    present = rollups["AA1 1AA"]
    assert present["serious_crime_avg_yr"] == 10.0
    assert present["minor_crime_avg_yr"] == 10.0
    # Missing postcode: rollups stay null instead of a fabricated 0.0.
    missing = rollups["BB2 2BB"]
    assert missing["serious_crime_avg_yr"] is None
    assert missing["minor_crime_avg_yr"] is None
|
||||
|
||||
|
||||
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
|
||||
base = {
|
||||
"postcode": "AA1 1AA",
|
||||
|
|
|
|||
|
|
@ -1,9 +1,44 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.poi_proximity import (
|
||||
POI_GROUPS_2KM,
|
||||
_build_poi_category_groups,
|
||||
_dynamic_poi_metric_renames,
|
||||
_groceries_categories,
|
||||
)
|
||||
from pipeline.utils.poi_counts import count_pois_per_postcode
|
||||
|
||||
|
||||
def test_groceries_2km_counts_geolytix_brand_categories() -> None:
    """GEOLYTIX brand POIs must be included in the static groceries 2km count.

    GEOLYTIX puts the brand (e.g. "Tesco") in `category` under the group
    "Groceries" and never emits the literal "Supermarket", so matching the OSM
    strings alone counts the supermarket but misses the brand store.
    """
    postcode_points = pl.DataFrame(
        {"postcode": ["SW1A 1AA"], "lat": [51.5010], "lon": [-0.1416]}
    )
    grocery_pois = pl.DataFrame(
        {
            "category": ["Tesco", "Supermarket"],
            "group": ["Groceries", "Groceries"],
            "lat": [51.5011, 51.5012],
            "lng": [-0.1417, -0.1418],
        }
    )

    grocery_categories = _groceries_categories(grocery_pois)
    groups_with_groceries = {**POI_GROUPS_2KM, "groceries": grocery_categories}
    counts = count_pois_per_postcode(
        postcode_points,
        grocery_pois,
        groups=groups_with_groceries,
        radius_km=2,
    )

    # The GEOLYTIX brand ("Tesco") and the OSM "Supermarket" both count. With
    # the earlier static list ["Greengrocer", "Supermarket", "Convenience
    # Store"], "Tesco" was dropped and the result was 1.
    assert counts["groceries_2km"][0] == 2
|
||||
|
||||
|
||||
def test_dynamic_poi_groups_include_requested_categories_only() -> None:
|
||||
|
|
|
|||
|
|
@ -1,6 +1,10 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.transform.transform_poi import transform_grocery_retail_points
|
||||
from pipeline.transform.transform_poi import (
|
||||
_load_ofsted_ratings,
|
||||
_school_icon_category_expr,
|
||||
transform_grocery_retail_points,
|
||||
)
|
||||
|
||||
|
||||
def test_transform_grocery_retail_points_outputs_chain_categories():
|
||||
|
|
@ -112,6 +116,33 @@ def test_transform_grocery_retail_points_merges_cooperative_societies():
|
|||
]
|
||||
|
||||
|
||||
def test_transform_grocery_retail_points_pools_small_coop_societies_before_cutoff():
    """Small Co-op societies pool under "Co-op" before the chain cutoff.

    Each society has fewer than 5 in-England stores on its own; only once
    they are normalised to the shared "Co-op" brand do they jointly clear
    MIN_GROCERY_CHAIN_LOCATIONS.
    """
    society_names = [
        "Central England Co-operative",
        "Lincolnshire Co-operative",
        "The Southern Co-operative",
        "Midcounties Co-operative",
        "Heart of England Co-operative",
    ]
    store_total = len(society_names)
    store_ids = list(range(1, store_total + 1))

    stores = pl.DataFrame(
        {
            "id": store_ids,
            "retailer": society_names,
            "fascia": ["The Co-operative Food"] * store_total,
            "store_name": [f"Co-op Test {i}" for i in store_ids],
            "long_wgs": [-0.141] * store_total,
            "lat_wgs": [51.515] * store_total,
        }
    )

    transformed = transform_grocery_retail_points(stores)

    assert transformed.height == store_total
    assert transformed["category"].unique().to_list() == ["Co-op"]
|
||||
|
||||
|
||||
def test_transform_grocery_retail_points_accepts_base_fascias():
|
||||
raw = pl.DataFrame(
|
||||
{
|
||||
|
|
@ -177,3 +208,87 @@ def test_transform_grocery_retail_points_includes_unmapped_chains_with_five_loca
|
|||
{"id": "glx-4", "category": "Tian Tian", "icon_category": "Tian Tian"},
|
||||
{"id": "glx-5", "category": "Tian Tian", "icon_category": "Tian Tian"},
|
||||
]
|
||||
|
||||
|
||||
def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
    """The ungraded outcome is used only when no graded result is usable."""
    # Each case: (URN, graded OEIF value, ungraded outcome, expected label).
    cases = [
        # URNs 1-4: graded results map straight through.
        (1, "1", None, "Outstanding"),
        (2, "2", None, "Good"),
        (3, "3", None, "Requires improvement"),
        (4, "4", None, "Inadequate"),
        # URNs 5-6: no usable grade (null / "Not judged") but a good or
        # outstanding ungraded outcome, including a "(Concerns)" suffix.
        (5, None, "School remains Outstanding", "Outstanding"),
        (6, "Not judged", "School remains Good (Concerns)", "Good"),
        # URN 7: genuinely "Not judged".
        (7, "Not judged", None, "Not judged"),
        # URN 8: a real grade 3 must NOT be overridden by an ungraded outcome.
        (8, "3", "School remains Outstanding", "Requires improvement"),
    ]
    urns, grades, outcomes, labels = (list(column) for column in zip(*cases))

    source_path = tmp_path / "ofsted.parquet"
    pl.DataFrame(
        {
            "URN": urns,
            "Latest OEIF overall effectiveness": grades,
            "Ungraded inspection overall outcome": outcomes,
        }
    ).write_parquet(source_path)

    loaded = _load_ofsted_ratings(source_path).collect()
    observed = loaded.sort("urn").to_dicts()

    expected = [
        {"urn": urn, "ofsted_rating": label} for urn, label in zip(urns, labels)
    ]
    assert observed == expected
|
||||
|
||||
|
||||
def test_school_icon_category_handles_one_sided_age_ranges():
    """Every age-range shape classifies instead of falling through to "School".

    gias._format_age_range emits "up to {high}", "{low}+" and "{low}–{high}";
    all three, plus a null age range, must map to the expected icon category.
    """
    # (age_range, expected icon category); phase and type_group are all null.
    cases = [
        ("up to 5", "Nursery school"),
        ("16+", "Sixth form"),
        ("3–18", "All-through school"),
        ("4–11", "Primary school"),
        # Null age_range with a null phase -> generic "School".
        (None, "School"),
    ]
    age_ranges = [age_range for age_range, _ in cases]
    expected = [label for _, label in cases]
    blank = [None] * len(cases)

    schools = pl.DataFrame(
        {
            "phase": blank,
            "type_group": blank,
            "age_range": age_ranges,
        },
        # Production reads these columns from a scanned parquet as String; an
        # all-null Python list would otherwise infer the Null dtype and break
        # the .str operations.
        schema_overrides={
            "phase": pl.String,
            "type_group": pl.String,
            "age_range": pl.String,
        },
    )

    classified = schools.select(_school_icon_category_expr().alias("category"))

    assert classified.get_column("category").to_list() == expected
|
||||
|
|
|
|||
|
|
@ -1289,22 +1289,27 @@ def transform_grocery_retail_points(
|
|||
)
|
||||
df = df.filter(pl.Series(mask))
|
||||
|
||||
eligible_retailers = (
|
||||
df.group_by("retailer")
|
||||
# Normalise to the display brand FIRST so the ~16 Co-op society retailer
|
||||
# names pool into one "Co-op" before the chain-eligibility cutoff; otherwise
|
||||
# small societies (<MIN_GROCERY_CHAIN_LOCATIONS stores each) get dropped.
|
||||
df = df.with_columns(
|
||||
pl.col("retailer")
|
||||
.map_elements(normalize_grocery_retailer, return_dtype=pl.String)
|
||||
.alias("category")
|
||||
)
|
||||
eligible_categories = (
|
||||
df.group_by("category")
|
||||
.len()
|
||||
.filter(pl.col("len") >= min_chain_locations)
|
||||
.select("retailer")
|
||||
.select("category")
|
||||
)
|
||||
df = df.join(eligible_retailers, on="retailer", how="semi")
|
||||
df = df.join(eligible_categories, on="category", how="semi")
|
||||
|
||||
return df.with_columns(
|
||||
pl.concat_str([pl.lit("glx-"), pl.col("id")]).alias("id"),
|
||||
pl.coalesce(["store_name", "fascia", "retailer"])
|
||||
.str.replace_all("''", "'")
|
||||
.alias("name"),
|
||||
pl.col("retailer")
|
||||
.map_elements(normalize_grocery_retailer, return_dtype=pl.String)
|
||||
.alias("category"),
|
||||
pl.struct(["fascia", "retailer"])
|
||||
.map_elements(
|
||||
lambda row: normalize_grocery_icon_category(row["fascia"], row["retailer"]),
|
||||
|
|
@ -1338,10 +1343,16 @@ def _school_icon_category_expr() -> pl.Expr:
|
|||
# GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
|
||||
# primary") so we normalise before matching.
|
||||
phase = pl.col("phase").str.to_lowercase()
|
||||
# age_range is "<min>–<max>" using an em-dash; both ends may be missing.
|
||||
age_parts = pl.col("age_range").str.split_exact("–", 1)
|
||||
min_age = age_parts.struct.field("field_0").cast(pl.Int32, strict=False)
|
||||
max_age = age_parts.struct.field("field_1").cast(pl.Int32, strict=False)
|
||||
# gias._format_age_range emits three shapes: "<low>–<high>" (en-dash),
|
||||
# "up to <high>" (high-only) and "<low>+" (low-only). Extract the leading
|
||||
# integer as low and the trailing integer as high, then suppress the wrong
|
||||
# end for the one-sided shapes so they don't collapse to a single bound.
|
||||
age = pl.col("age_range")
|
||||
leading = age.str.extract(r"^\s*(\d+)", 1).cast(pl.Int32, strict=False)
|
||||
trailing = age.str.extract(r"(\d+)\s*$", 1).cast(pl.Int32, strict=False)
|
||||
# "up to N": no low bound; "N+": no high bound.
|
||||
min_age = pl.when(age.str.starts_with("up to")).then(None).otherwise(leading)
|
||||
max_age = pl.when(age.str.ends_with("+")).then(None).otherwise(trailing)
|
||||
return (
|
||||
pl.when(pl.col("type_group") == "Universities")
|
||||
.then(pl.lit("University"))
|
||||
|
|
@ -1386,9 +1397,16 @@ OFSTED_OEIF_LABELS = {
|
|||
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
||||
"""Project the latest OEIF effectiveness grade to a human-readable label,
|
||||
keyed by URN so it can be joined onto the GIAS register. Grades 1-4 map to
|
||||
the conventional Ofsted labels; "Not judged" (post-2025 reform schools that
|
||||
only have a report card) is preserved verbatim; null grades drop out."""
|
||||
the conventional Ofsted labels; when there is no usable graded result
|
||||
(null/"Not judged", e.g. schools last seen under the post-2024 ungraded
|
||||
report-card framework) we fall back to "Ungraded inspection overall outcome"
|
||||
so genuinely good/outstanding schools aren't dropped — mirroring
|
||||
school_proximity.classify_good_plus_schools. Remaining nulls drop out."""
|
||||
grade_col = pl.col("Latest OEIF overall effectiveness")
|
||||
# See school_proximity: the ungraded outcome carries "School remains Good"/
|
||||
# "School remains Outstanding" (with optional "(Concerns)"/"(Improving)"
|
||||
# suffixes) when the graded column is null/"Not judged".
|
||||
ungraded = pl.col("Ungraded inspection overall outcome").cast(pl.Utf8, strict=False)
|
||||
label = (
|
||||
pl.when(grade_col == "1")
|
||||
.then(pl.lit(OFSTED_OEIF_LABELS["1"]))
|
||||
|
|
@ -1398,6 +1416,10 @@ def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
|
|||
.then(pl.lit(OFSTED_OEIF_LABELS["3"]))
|
||||
.when(grade_col == "4")
|
||||
.then(pl.lit(OFSTED_OEIF_LABELS["4"]))
|
||||
.when(ungraded.str.starts_with("School remains Outstanding"))
|
||||
.then(pl.lit(OFSTED_OEIF_LABELS["1"]))
|
||||
.when(ungraded.str.starts_with("School remains Good"))
|
||||
.then(pl.lit(OFSTED_OEIF_LABELS["2"]))
|
||||
.when(grade_col == "Not judged")
|
||||
.then(pl.lit("Not judged"))
|
||||
.otherwise(None)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue