372 lines
14 KiB
Python
372 lines
14 KiB
Python
import json
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
import pytest
|
|
import shapely
|
|
from pyproj import Transformer
|
|
|
|
from pipeline.transform.crime_spatial import transform_crime_spatial
|
|
from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons
|
|
|
|
# Shared transformer from EPSG:27700 to EPSG:4326. ``always_xy=True`` makes
# it take (x, y) and hand back (lon, lat) in that order.
_TO_WGS84 = Transformer.from_crs(
    crs_from="EPSG:27700",
    crs_to="EPSG:4326",
    always_xy=True,
)
|
|
|
|
# Header line written at the top of every synthetic street-level crime CSV.
# Column order matters: _crime_row emits its fields positionally.
_CSV_HEADER = ",".join(
    (
        "Crime ID",
        "Month",
        "Reported by",
        "Falls within",
        "Longitude",
        "Latitude",
        "Location",
        "LSOA code",
        "LSOA name",
        "Crime type",
        "Last outcome category",
        "Context",
    )
)
|
|
|
|
|
|
def _bng_to_wgs84(x: float, y: float) -> tuple[float, float]:
    """Run ``(x, y)`` through the module-level ``_TO_WGS84`` transformer.

    Args:
        x: Source x coordinate (EPSG:27700 in this module).
        y: Source y coordinate (EPSG:27700 in this module).

    Returns:
        The transformed ``(lon, lat)`` pair.
    """
    longitude, latitude = _TO_WGS84.transform(x, y)
    return longitude, latitude
|
|
|
|
|
|
def _square_feature(postcode: str, x0: float, y0: float, x1: float, y1: float) -> dict:
    """Build a GeoJSON ``Feature`` for an axis-aligned rectangle.

    The rectangle is described by two opposite corners ``(x0, y0)`` and
    ``(x1, y1)``; each corner is converted with ``_bng_to_wgs84`` before being
    stored in the polygon ring.

    Args:
        postcode: Stored under ``properties["postcodes"]``; the same value with
            spaces removed is stored under ``properties["mapit_code"]``.
        x0: X of the first corner.
        y0: Y of the first corner.
        x1: X of the opposite corner.
        y1: Y of the opposite corner.

    Returns:
        A dict with ``type``, ``properties`` and ``geometry`` keys.
    """
    # Closed ring: the first corner is repeated as the last vertex.
    corners = ((x0, y0), (x1, y0), (x1, y1), (x0, y1), (x0, y0))
    exterior = []
    for corner_x, corner_y in corners:
        exterior.append(list(_bng_to_wgs84(corner_x, corner_y)))

    properties = {"postcodes": postcode, "mapit_code": postcode.replace(" ", "")}
    geometry = {"type": "Polygon", "coordinates": [exterior]}
    return {"type": "Feature", "properties": properties, "geometry": geometry}
|
|
|
|
|
|
def _write_boundaries(units_dir, features_by_district: dict[str, list[dict]]) -> None:
    """Write one ``<district>.geojson`` FeatureCollection per district.

    Args:
        units_dir: Directory to create and fill. It is created with
            ``mkdir(parents=True)``, so it must not already exist.
            NOTE(review): presumably a ``pathlib.Path`` -- confirm with callers.
        features_by_district: Maps a district name (used as the file stem) to
            the list of GeoJSON features for that district.
    """
    units_dir.mkdir(parents=True)
    for district in features_by_district:
        payload = json.dumps(
            {
                "type": "FeatureCollection",
                "features": features_by_district[district],
            }
        )
        target = units_dir / f"{district}.geojson"
        target.write_text(payload)
|
|
|
|
|
|
def _crime_row(month: str, x, y, crime_type: str) -> str:
    """Format one CSV data row for a synthetic incident.

    Args:
        month: Value for the ``Month`` column.
        x: X coordinate passed to ``_bng_to_wgs84``, or ``None``.
        y: Y coordinate passed to ``_bng_to_wgs84``, or ``None``.
        crime_type: Value for the ``Crime type`` column.

    Returns:
        A comma-separated row matching ``_CSV_HEADER``. If either coordinate
        is ``None`` the Longitude and Latitude fields are left empty.
    """
    has_location = x is not None and y is not None
    lon, lat = _bng_to_wgs84(x, y) if has_location else ("", "")
    return f",{month},F,F,{lon},{lat},On or near X,E01000001,L,{crime_type},U,"
|
|
|
|
|
|
def _write_month(crime_dir, month: str, rows: list[str]) -> None:
    """Write one month's street-level crime CSV under ``crime_dir/<month>/``.

    The file is named ``<month>-test-force-street.csv`` and contains
    ``_CSV_HEADER`` followed by ``rows``, one per line, with a trailing
    newline.

    Args:
        crime_dir: Root directory for the crime fixtures.
            NOTE(review): presumably a ``pathlib.Path`` -- confirm with callers.
        month: Month label, used for both the sub-directory and the file name.
        rows: Pre-formatted CSV data rows (see ``_crime_row``).

    Raises:
        FileExistsError: If the month directory already exists, because it is
            created without ``exist_ok``.
    """
    month_dir = crime_dir / month
    month_dir.mkdir(parents=True)
    body = "\n".join([_CSV_HEADER, *rows]) + "\n"
    # Pin the encoding so the fixture bytes do not depend on the platform's
    # locale default (e.g. cp1252 on Windows).
    (month_dir / f"{month}-test-force-street.csv").write_text(body, encoding="utf-8")
|
|
|
|
|
|
def test_buffer_overlap_counts_for_each_postcode(tmp_path):
    """An incident counts for every postcode whose 50m buffer contains it."""
    units = tmp_path / "units"
    # AB1 1AA and AB1 1AB are 70m apart, so their +50m buffers share the strip
    # x in [1030, 1060]. AB1 1AC sits far away from both.
    district_features = [
        _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),
        _square_feature("AB1 1AB", 1080, 1000, 1090, 1010),
        _square_feature("AB1 1AC", 5000, 5000, 5010, 5010),
    ]
    _write_boundaries(units, {"AB1": district_features})

    crime = tmp_path / "crime"
    month = "2024-01"
    incidents = [
        # Inside the shared strip: 35m east of A and 35m west of B -> both.
        _crime_row(month, 1045, 1005, "Burglary"),
        # 49m east of C's edge -> inside C's buffer.
        _crime_row(month, 5059, 5005, "Robbery"),
        # 51m east of C's edge -> outside every buffer.
        _crime_row(month, 5061, 5005, "Robbery"),
        # No coordinate -> dropped entirely.
        _crime_row(month, None, None, "Anti-social behaviour"),
    ]
    _write_month(crime, month, incidents)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    # The geometry above was designed around a 50m buffer, so pin it here
    # (the production default is now 100m). All three squares are equal-area,
    # so area normalisation leaves the counts unchanged.
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    by_postcode = {}
    for record in pl.read_parquet(output).to_dicts():
        by_postcode[record["postcode"]] = record

    # A single month of data is annualised x12.
    expectations = [
        ("AB1 1AA", "Burglary (avg/yr)", 12.0),
        ("AB1 1AB", "Burglary (avg/yr)", 12.0),
        ("AB1 1AA", "Robbery (avg/yr)", 0.0),
        # Only the 49m robbery counts for C; the 51m one and the blank row do not.
        ("AB1 1AC", "Robbery (avg/yr)", 12.0),
        ("AB1 1AC", "Burglary (avg/yr)", 0.0),
    ]
    for postcode, column, value in expectations:
        assert by_postcode[postcode][column] == value

    # The anti-social behaviour row had no coordinate, so nobody is credited.
    for record in by_postcode.values():
        assert record["Anti-social behaviour (avg/yr)"] == 0.0
|
|
|
|
|
|
def test_by_year_annualises_and_rolls_up(tmp_path):
    """Per-year series are annualised and rolled up into the serious total."""
    units = tmp_path / "units"
    square = _square_feature("AB1 1AA", 1000, 1000, 1010, 1010)
    _write_boundaries(units, {"AB1": [square]})

    crime = tmp_path / "crime"
    # Every incident sits at the centre of AB1 1AA, well inside its buffer.
    crime_types_by_month = {
        "2023-01": ["Burglary", "Robbery"],
        "2024-01": ["Burglary"],
        "2024-02": ["Burglary", "Anti-social behaviour"],
    }
    for month, crime_types in crime_types_by_month.items():
        month_rows = [_crime_row(month, 1005, 1005, kind) for kind in crime_types]
        _write_month(crime, month, month_rows)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    by_year_df = pl.read_parquet(by_year)
    assert by_year_df.height == 1
    required_columns = {
        "Burglary (by year)",
        "Serious crime (by year)",
        "Minor crime (by year)",
    }
    assert required_columns.issubset(by_year_df.columns)

    row = by_year_df.row(0, named=True)
    burglary_points = sorted(
        row["Burglary (by year)"], key=lambda point: point["year"]
    )
    # 2023: 1 burglary in 1 month -> 12/yr; 2024: 2 in 2 months -> 12/yr.
    assert burglary_points == [
        {"year": 2023, "count": 12.0},
        {"year": 2024, "count": 12.0},
    ]

    serious_by_year = {}
    for point in row["Serious crime (by year)"]:
        serious_by_year[point["year"]] = point["count"]
    # 2023 serious = Burglary(12) + Robbery(12) = 24; 2024 = Burglary(12).
    assert serious_by_year[2023] == 24.0
    assert serious_by_year[2024] == 12.0
|
|
|
|
|
|
def test_area_normalisation_divides_out_buffered_catchment(tmp_path):
    """Counts are rescaled by ``median_catchment / buffered_area``.

    Three postcodes of increasing footprint each get exactly one incident in
    their buffer. After normalisation the smallest scores highest and the
    median-sized one is unchanged, i.e. the metric is a density. Dividing by
    the *buffered* catchment rather than the raw polygon means the fixed
    buffer ring acts as a floor, so the spread stays gentle and the tiniest
    postcode is not blown up out of proportion.
    """
    units = tmp_path / "units"
    footprints = [
        _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),  # 10x10
        _square_feature("AB1 1AB", 3000, 3000, 3010, 3020),  # 10x20 (median)
        _square_feature("AB1 1AC", 5000, 5000, 5020, 5020),  # 20x20
    ]
    _write_boundaries(units, {"AB1": footprints})

    crime = tmp_path / "crime"
    month = "2024-01"
    # One burglary at the centre of each footprint.
    centres = [(1005, 1005), (3005, 3010), (5010, 5010)]
    _write_month(
        crime,
        month,
        [_crime_row(month, cx, cy, "Burglary") for cx, cy in centres],
    )

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    # Re-derive the expectation from the same buffered catchment areas: every
    # postcode is 12/yr before normalisation, then x (median_buf / buffered_area).
    postcodes, polygons = load_postcode_polygons(units)
    buf_area = {}
    for pc, poly in zip(postcodes, polygons):
        buffered = shapely.buffer(poly, 50.0, quad_segs=8)
        buf_area[pc] = float(shapely.area(buffered))
    median_buf = float(np.median(list(buf_area.values())))
    expected = {pc: 12.0 * median_buf / area for pc, area in buf_area.items()}

    rows = {}
    for record in pl.read_parquet(output).to_dicts():
        rows[record["postcode"]] = record
    for pc, exp in expected.items():
        assert rows[pc]["Burglary (avg/yr)"] == pytest.approx(exp, abs=0.1)

    # The median catchment is unchanged; ordering follows inverse buffered
    # area, but the buffer-ring floor keeps the spread far below the ~4x
    # raw-area ratio.
    assert rows["AB1 1AB"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
    small = rows["AB1 1AA"]["Burglary (avg/yr)"]
    big = rows["AB1 1AC"]["Burglary (avg/yr)"]
    assert big < 12.0 < small
    assert small / big < 1.5

    # The by-year series carries the same normalisation.
    by_year_df = pl.read_parquet(by_year)
    only_small = by_year_df.filter(pl.col("postcode") == "AB1 1AA")
    small_row = only_small.row(0, named=True)
    expected_point = {
        "year": 2024,
        "count": pytest.approx(expected["AB1 1AA"], abs=0.1),
    }
    assert small_row["Burglary (by year)"] == [expected_point]
|
|
|
|
|
|
def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
    """The headline avg/yr is the unweighted mean of the per-year bars.

    Month coverage is uneven: 2023 has 1 month (2 incidents -> 24/yr) and
    2024 has 2 months (2 incidents -> 12/yr). The headline must be the
    *simple* mean of the bars, (24 + 12) / 2 = 18, not the month-weighted
    pooled rate (4 incidents / 3 months * 12 = 16).
    """
    units = tmp_path / "units"
    square = _square_feature("AB1 1AA", 1000, 1000, 1010, 1010)
    _write_boundaries(units, {"AB1": [square]})

    crime = tmp_path / "crime"
    burglaries_per_month = {"2023-01": 2, "2024-01": 1, "2024-02": 1}
    for month, incident_count in burglaries_per_month.items():
        month_rows = [
            _crime_row(month, 1005, 1005, "Burglary") for _ in range(incident_count)
        ]
        _write_month(crime, month, month_rows)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    headline = pl.read_parquet(output).row(0, named=True)
    assert headline["Burglary (avg/yr)"] == pytest.approx(18.0, abs=0.05)

    yearly = pl.read_parquet(by_year).row(0, named=True)
    bars = {}
    for point in yearly["Burglary (by year)"]:
        bars[point["year"]] = point["count"]
    expected_bars = {
        2023: pytest.approx(24.0, abs=0.05),
        2024: pytest.approx(12.0, abs=0.05),
    }
    assert bars == expected_bars
|
|
|
|
|
|
def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
    """avg/yr divides by the postcode's own years-present, not the global span.

    P (AB1 1AA) has burglaries only in 2024; Q (AB1 1AB), far away, has one
    burglary in 2014. Across all postcodes the type spans TWO distinct years,
    but only ONE for P. P's headline must therefore equal its single by-year
    bar (24/yr) rather than being deflated to 12/yr by the global span. The
    two squares are equal-area, so area normalisation leaves counts as-is.
    """
    units = tmp_path / "units"
    squares = [
        _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),
        _square_feature("AB1 1AB", 5000, 5000, 5010, 5010),
    ]
    _write_boundaries(units, {"AB1": squares})

    crime = tmp_path / "crime"
    # P: 2 burglaries in a single 2024 month -> 24/yr bar, present in 1 year.
    p_rows = [_crime_row("2024-01", 1005, 1005, "Burglary") for _ in range(2)]
    _write_month(crime, "2024-01", p_rows)
    # Q: 1 burglary in a far-back 2014 month. This widens the type's global
    # span to two years without adding any incident to P.
    q_rows = [_crime_row("2014-01", 5005, 5005, "Burglary")]
    _write_month(crime, "2014-01", q_rows)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    rows = {}
    for record in pl.read_parquet(output).to_dicts():
        rows[record["postcode"]] = record
    by_year_rows = {}
    for record in pl.read_parquet(by_year).to_dicts():
        by_year_rows[record["postcode"]] = record

    # P's headline equals the simple mean of its own bars (just the 2024 bar).
    p_bars = {
        point["year"]: point["count"]
        for point in by_year_rows["AB1 1AA"]["Burglary (by year)"]
    }
    assert p_bars == {2024: pytest.approx(24.0, abs=0.05)}
    # Per-postcode denominator (1) -> 24.0. The old global denominator
    # (2 years across all postcodes) would have deflated this to 12.0.
    p_headline = rows["AB1 1AA"]["Burglary (avg/yr)"]
    assert p_headline == pytest.approx(24.0, abs=0.05)
    mean_of_p_bars = sum(p_bars.values()) / len(p_bars)
    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == pytest.approx(
        mean_of_p_bars, abs=0.05
    )

    # Q likewise: its sole 2014 bar -> 12/yr, divided by its own 1 year = 12.0.
    q_bars = {
        point["year"]: point["count"]
        for point in by_year_rows["AB1 1AB"]["Burglary (by year)"]
    }
    assert q_bars == {2014: pytest.approx(12.0, abs=0.05)}
    assert rows["AB1 1AB"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
|
|
|
|
|
|
def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
    """An unrecognised crime type yields no column but is reported on stderr."""
    units = tmp_path / "units"
    square = _square_feature("AB1 1AA", 1000, 1000, 1010, 1010)
    _write_boundaries(units, {"AB1": [square]})

    crime = tmp_path / "crime"
    month = "2024-01"
    incidents = [
        _crime_row(month, 1005, 1005, kind) for kind in ("Burglary", "Cyber fraud")
    ]
    _write_month(crime, month, incidents)

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    columns = pl.read_parquet(output).columns
    # The unknown type gets no column of its own; known types are unaffected.
    assert "Cyber fraud (avg/yr)" not in columns
    assert "Burglary (avg/yr)" in columns

    # A warning naming the dropped type must have gone to stderr.
    stderr_text = capsys.readouterr().err
    assert "Cyber fraud" in stderr_text
    assert "WARNING" in stderr_text
|
|
|
|
|
|
def test_legacy_crime_types_are_mapped(tmp_path):
    """Pre-2014 crime-type names are aliased to their current equivalents by
    the spatial transform rather than being dropped as unknown types."""
    units = tmp_path / "units"
    square = _square_feature("AB1 1AA", 1000, 1000, 1010, 1010)
    _write_boundaries(units, {"AB1": [square]})

    crime = tmp_path / "crime"
    month = "2013-01"
    legacy_names = ("Violent crime", "Public disorder and weapons")
    _write_month(
        crime,
        month,
        [_crime_row(month, 1005, 1005, name) for name in legacy_names],
    )

    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    headline = pl.read_parquet(output).to_dicts()[0]
    # Single postcode -> area-norm factor 1.0; single month/year -> x12.
    for column in ("Violence and sexual offences (avg/yr)", "Public order (avg/yr)"):
        assert headline[column] == 12.0

    by_year_row = pl.read_parquet(by_year).row(0, named=True)
    single_2013_bar = [{"year": 2013, "count": 12.0}]
    assert by_year_row["Violence and sexual offences (by year)"] == single_2013_bar
    assert by_year_row["Public order (by year)"] == single_2013_bar
|