improve
This commit is contained in:
parent
8688b7475e
commit
e8345cbdc1
40 changed files with 1980 additions and 904 deletions
|
|
@ -1,9 +1,13 @@
|
|||
import json
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import pytest
|
||||
import shapely
|
||||
from pyproj import Transformer
|
||||
|
||||
from pipeline.transform.crime_spatial import transform_crime_spatial
|
||||
from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons
|
||||
|
||||
# Shared reprojection from EPSG:27700 (British National Grid, metres) to
# EPSG:4326 (WGS84). always_xy=True pins the axis order to x, y, i.e.
# easting/northing in and longitude/latitude out.
# NOTE(review): presumably consumed by the fixture helpers defined further
# down the module (not visible in this view) -- confirm.
_TO_WGS84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)
|
||||
|
||||
|
|
@ -82,7 +86,10 @@ def test_buffer_overlap_counts_for_each_postcode(tmp_path):
|
|||
|
||||
output = tmp_path / "crime_by_postcode.parquet"
|
||||
by_year = tmp_path / "crime_by_postcode_by_year.parquet"
|
||||
transform_crime_spatial(crime, units, output, by_year)
|
||||
# Pin the 50m buffer the geometry above was designed around (the production
|
||||
# default is now 100m). The three squares are equal-area, so area
|
||||
# normalisation leaves the counts unchanged.
|
||||
transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
|
||||
|
||||
rows = {
|
||||
r["postcode"]: r
|
||||
|
|
@ -127,7 +134,7 @@ def test_by_year_annualises_and_rolls_up(tmp_path):
|
|||
|
||||
output = tmp_path / "crime_by_postcode.parquet"
|
||||
by_year = tmp_path / "crime_by_postcode_by_year.parquet"
|
||||
transform_crime_spatial(crime, units, output, by_year)
|
||||
transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
|
||||
|
||||
by_year_df = pl.read_parquet(by_year)
|
||||
assert by_year_df.height == 1
|
||||
|
|
@ -145,3 +152,130 @@ def test_by_year_annualises_and_rolls_up(tmp_path):
|
|||
# 2023 serious = Burglary(12) + Robbery(12) = 24; 2024 = Burglary(12).
|
||||
assert serious[2023] == 24.0
|
||||
assert serious[2024] == 12.0
|
||||
|
||||
|
||||
def test_area_normalisation_divides_out_buffered_catchment(tmp_path):
    # Scenario: three postcodes with growing footprints, one incident inside
    # each buffer. The normalised figure is count x (median_catchment /
    # buffered_area), so it behaves like a density: the smallest postcode
    # ranks highest and the median-sized one keeps its raw value. Because the
    # divisor is the *buffered* catchment rather than the bare polygon, the
    # constant buffer ring dominates the area and the spread stays modest.
    units = tmp_path / "units"
    crime = tmp_path / "crime"
    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"

    footprints = [
        ("AB1 1AA", 1000, 1000, 1010, 1010),  # 10x10
        ("AB1 1AB", 3000, 3000, 3010, 3020),  # 10x20 (median)
        ("AB1 1AC", 5000, 5000, 5020, 5020),  # 20x20
    ]
    _write_boundaries(
        units, {"AB1": [_square_feature(*spec) for spec in footprints]}
    )

    # One burglary at the centre of each square.
    incident_points = [(1005, 1005), (3005, 3010), (5010, 5010)]
    _write_month(
        crime,
        "2024-01",
        [_crime_row("2024-01", x, y, "Burglary") for x, y in incident_points],
    )

    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    # Rebuild the expectation from the same buffered catchments: every
    # postcode annualises to 12/yr and is then scaled by
    # median_area / its own buffered area.
    names, shapes = load_postcode_polygons(units)
    catchment_area = {}
    for name, shape in zip(names, shapes):
        buffered = shapely.buffer(shape, 50.0, quad_segs=8)
        catchment_area[name] = float(shapely.area(buffered))
    median_area = float(np.median(list(catchment_area.values())))
    expected = {
        name: 12.0 * median_area / area for name, area in catchment_area.items()
    }

    rows = {}
    for record in pl.read_parquet(output).to_dicts():
        rows[record["postcode"]] = record
    for name, want in expected.items():
        assert rows[name]["Burglary (avg/yr)"] == pytest.approx(want, abs=0.1)

    # The median catchment is left untouched; the others order by inverse
    # buffered area, yet the gap stays well under the ~4x raw-area ratio.
    median_rate = rows["AB1 1AB"]["Burglary (avg/yr)"]
    assert median_rate == pytest.approx(12.0, abs=0.05)
    small = rows["AB1 1AA"]["Burglary (avg/yr)"]
    big = rows["AB1 1AC"]["Burglary (avg/yr)"]
    assert small > 12.0 > big
    assert small / big < 1.5

    # The per-year output must apply the identical scaling.
    yearly = pl.read_parquet(by_year)
    small_row = yearly.filter(pl.col("postcode") == "AB1 1AA").row(0, named=True)
    expected_series = [
        {"year": 2024, "count": pytest.approx(expected["AB1 1AA"], abs=0.1)}
    ]
    assert small_row["Burglary (by year)"] == expected_series
|
||||
|
||||
|
||||
def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
    # Month coverage differs between years: 2023 has a single month holding
    # 2 incidents (24/yr) while 2024 has two months holding 2 incidents
    # (12/yr). The headline figure has to be the unweighted mean of those
    # yearly bars, (24 + 12) / 2 = 18, and not the pooled month-weighted
    # rate of 4 incidents / 3 months * 12 = 16.
    units = tmp_path / "units"
    crime = tmp_path / "crime"
    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"

    _write_boundaries(
        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
    )

    # 2023: one month with two incidents.
    _write_month(
        crime,
        "2023-01",
        [_crime_row("2023-01", 1005, 1005, "Burglary") for _ in range(2)],
    )
    # 2024: two months with one incident each.
    for month in ("2024-01", "2024-02"):
        _write_month(crime, month, [_crime_row(month, 1005, 1005, "Burglary")])

    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    headline = pl.read_parquet(output).row(0, named=True)
    assert headline["Burglary (avg/yr)"] == pytest.approx(18.0, abs=0.05)

    yearly = pl.read_parquet(by_year).row(0, named=True)
    bars = {}
    for point in yearly["Burglary (by year)"]:
        bars[point["year"]] = point["count"]
    expected_bars = {
        2023: pytest.approx(24.0, abs=0.05),
        2024: pytest.approx(12.0, abs=0.05),
    }
    assert bars == expected_bars
|
||||
|
||||
|
||||
def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
    # One recognised category ("Burglary") and one unrecognised category
    # ("Cyber fraud") at the same point in the same month.
    units = tmp_path / "units"
    crime = tmp_path / "crime"
    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"

    _write_boundaries(
        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
    )
    incidents = [
        _crime_row("2024-01", 1005, 1005, crime_type)
        for crime_type in ("Burglary", "Cyber fraud")
    ]
    _write_month(crime, "2024-01", incidents)

    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)

    # The unrecognised type gets no output column, while the known one does...
    columns = pl.read_parquet(output).columns
    assert "Cyber fraud (avg/yr)" not in columns
    assert "Burglary (avg/yr)" in columns

    # ...and the drop is reported on stderr as a warning naming the type.
    captured = capsys.readouterr()
    err = captured.err
    assert "Cyber fraud" in err
    assert "WARNING" in err
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue