"""Tests for ``transform_crime_spatial``.

Each test builds a tiny synthetic world under ``tmp_path`` -- postcode
boundary GeoJSON files plus monthly street-crime CSVs -- runs the transform,
and asserts on the two parquet outputs it writes (the per-postcode
``(avg/yr)`` headline table and the per-postcode ``(by year)`` series).

Fixture coordinates are written in EPSG:27700 metres so distances in the
comments can be read off directly; they are converted to lon/lat before being
written, because that is what the input files carry.
"""

import json
from pathlib import Path
from typing import Optional

import numpy as np
import polars as pl
import pytest
import shapely
from pyproj import Transformer

from pipeline.transform.crime_spatial import transform_crime_spatial
from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons

_TO_WGS84 = Transformer.from_crs("EPSG:27700", "EPSG:4326", always_xy=True)

_CSV_HEADER = (
    "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,"
    "LSOA code,LSOA name,Crime type,Last outcome category,Context"
)

# Buffer radius (metres) passed to the transform by ``_run``. Tests that
# re-derive expected values must use the same radius, so it lives in one place.
_BUFFER_M = 50.0


def _bng_to_wgs84(x: float, y: float) -> tuple[float, float]:
    """Convert an EPSG:27700 ``(x, y)`` point to EPSG:4326 ``(lon, lat)``."""
    lon, lat = _TO_WGS84.transform(x, y)
    return lon, lat


def _square_feature(postcode: str, x0: float, y0: float, x1: float, y1: float) -> dict:
    """Build a GeoJSON polygon feature for an axis-aligned rectangle.

    The corners ``(x0, y0)``-``(x1, y1)`` are given in EPSG:27700 and written
    out as lon/lat. ``mapit_code`` is the postcode with spaces removed.
    """
    ring = [(x0, y0), (x1, y0), (x1, y1), (x0, y1), (x0, y0)]
    coords = [list(_bng_to_wgs84(x, y)) for x, y in ring]
    return {
        "type": "Feature",
        "properties": {"postcodes": postcode, "mapit_code": postcode.replace(" ", "")},
        "geometry": {"type": "Polygon", "coordinates": [coords]},
    }


def _write_boundaries(
    units_dir: Path, features_by_district: dict[str, list[dict]]
) -> None:
    """Write one ``<district>.geojson`` FeatureCollection per district."""
    # exist_ok matches _write_month, so a test may call this more than once.
    units_dir.mkdir(parents=True, exist_ok=True)
    for district, features in features_by_district.items():
        collection = {"type": "FeatureCollection", "features": features}
        (units_dir / f"{district}.geojson").write_text(
            json.dumps(collection), encoding="utf-8"
        )


def _crime_row(
    month: str, x: Optional[float], y: Optional[float], crime_type: str
) -> str:
    """Return one CSV data row matching ``_CSV_HEADER``.

    ``x``/``y`` are EPSG:27700 coordinates; if either is ``None`` the
    Longitude and Latitude fields are left blank.
    """
    if x is None or y is None:
        lon, lat = "", ""
    else:
        lon, lat = _bng_to_wgs84(x, y)
    return f",{month},F,F,{lon},{lat},On or near X,E01000001,L,{crime_type},U,"


def _write_month(
    crime_dir: Path, month: str, rows: list[str], force: str = "test-force"
) -> None:
    """Write one force's monthly CSV; an empty ``rows`` list still creates the
    file, which counts as published coverage for that (force, month)."""
    month_dir = crime_dir / month
    month_dir.mkdir(parents=True, exist_ok=True)
    body = "\n".join([_CSV_HEADER, *rows]) + "\n"
    (month_dir / f"{month}-{force}-street.csv").write_text(body, encoding="utf-8")


def _run(
    tmp_path: Path, crime: Path, units: Path, **kwargs
) -> tuple[pl.DataFrame, pl.DataFrame]:
    """Run the transform and return ``(avg_df, by_year_df)``.

    Outputs are written under ``tmp_path`` and read back. Extra keyword
    arguments (e.g. ``min_bar_months``) are forwarded to the transform.
    """
    output = tmp_path / "crime_by_postcode.parquet"
    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(
        crime, units, output, by_year, buffer_m=_BUFFER_M, **kwargs
    )
    return pl.read_parquet(output), pl.read_parquet(by_year)


def _single_postcode_units(tmp_path: Path) -> Path:
    """Create a boundaries dir holding only the 10x10 postcode ``AB1 1AA``.

    Its centre, (1005, 1005), is the point most tests drop incidents on.
    """
    units = tmp_path / "units"
    _write_boundaries(
        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
    )
    return units


def _by_postcode(df: pl.DataFrame) -> dict:
    """Index a frame's rows (as dicts) by their ``postcode`` value."""
    return {record["postcode"]: record for record in df.to_dicts()}


def _bars(row: dict, column: str) -> dict:
    """Flatten a ``(by year)`` series into ``{year: count}``."""
    return {point["year"]: point["count"] for point in row[column]}


def _coverage(row: dict) -> dict:
    """Flatten a row's ``covered_years`` list into ``{year: months}``."""
    return {entry["year"]: entry["months"] for entry in row["covered_years"]}


def test_buffer_overlap_counts_for_each_postcode(tmp_path):
    units = tmp_path / "units"
    # A and B sit 70m apart; their +50m buffers overlap in x in [1030, 1060].
    _write_boundaries(
        units,
        {
            "AB1": [
                _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),
                _square_feature("AB1 1AB", 1080, 1000, 1090, 1010),
                _square_feature("AB1 1AC", 5000, 5000, 5010, 5010),
            ]
        },
    )
    crime = tmp_path / "crime"
    _write_month(
        crime,
        "2024-01",
        [
            # In the overlap: 35m east of A, 35m west of B -> counts for both.
            _crime_row("2024-01", 1045, 1005, "Burglary"),
            # 49m east of C's edge -> inside C's buffer.
            _crime_row("2024-01", 5059, 5005, "Robbery"),
            # 51m east of C's edge -> outside every buffer.
            _crime_row("2024-01", 5061, 5005, "Robbery"),
            # No coordinate -> dropped entirely.
            _crime_row("2024-01", None, None, "Anti-social behaviour"),
        ],
    )

    avg_df, _ = _run(tmp_path, crime, units)
    rows = _by_postcode(avg_df)

    # Single covered month -> pooled rate x12. With three postcodes the
    # area-normalisation factor is median / own buffered area; the squares are
    # nominally equal, but after the lon/lat round trip the factor need not be
    # bit-exactly 1, so non-zero values use the file's usual tolerance.
    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
    assert rows["AB1 1AB"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
    assert rows["AB1 1AA"]["Robbery (avg/yr)"] == 0.0
    # Only the 49m robbery counts for C; the 51m one and the blank row do not.
    assert rows["AB1 1AC"]["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05)
    assert rows["AB1 1AC"]["Burglary (avg/yr)"] == 0.0
    # Anti-social behaviour had no coordinate -> nobody gets it.
    assert all(r["Anti-social behaviour (avg/yr)"] == 0.0 for r in rows.values())


def test_by_year_annualises_and_rolls_up(tmp_path):
    units = _single_postcode_units(tmp_path)
    crime = tmp_path / "crime"
    # Point at the centre of AB1 1AA, well inside its buffer.
    _write_month(
        crime,
        "2023-01",
        [
            _crime_row("2023-01", 1005, 1005, "Burglary"),
            _crime_row("2023-01", 1005, 1005, "Robbery"),
        ],
    )
    _write_month(crime, "2024-01", [_crime_row("2024-01", 1005, 1005, "Burglary")])
    _write_month(
        crime,
        "2024-02",
        [
            _crime_row("2024-02", 1005, 1005, "Burglary"),
            _crime_row("2024-02", 1005, 1005, "Anti-social behaviour"),
        ],
    )

    _, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)

    assert by_year_df.height == 1
    cols = set(by_year_df.columns)
    assert {
        "Burglary (by year)",
        "Serious crime (by year)",
        "Minor crime (by year)",
    } <= cols

    row = by_year_df.row(0, named=True)
    burglary = sorted(row["Burglary (by year)"], key=lambda r: r["year"])
    # 2023: 1 burglary in 1 covered month -> 12/yr; 2024: 2 in 2 months -> 12/yr.
    assert burglary == [
        {"year": 2023, "count": 12.0},
        {"year": 2024, "count": 12.0},
    ]

    serious = _bars(row, "Serious crime (by year)")
    # 2023 serious = Burglary(12) + Robbery(12) = 24; 2024 = Burglary(12).
    assert serious[2023] == 24.0
    assert serious[2024] == 12.0

    # Coverage calendar: both years published, with their month counts.
    assert _coverage(row) == {2023: 1, 2024: 2}


def test_area_normalisation_divides_out_buffered_catchment(tmp_path):
    # Three postcodes of increasing footprint, each with exactly one incident in
    # its buffer. Normalisation rescales by median_catchment / buffered_area, so
    # the smallest scores highest and the median-sized one is unchanged -- i.e.
    # the metric is a density. Dividing by the *buffered* catchment (not the raw
    # polygon) means the fixed buffer-ring floor keeps the spread gentle, so the
    # tiniest postcode is not blown up out of proportion.
    units = tmp_path / "units"
    _write_boundaries(
        units,
        {
            "AB1": [
                _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),  # 10x10
                _square_feature("AB1 1AB", 3000, 3000, 3010, 3020),  # 10x20 (median)
                _square_feature("AB1 1AC", 5000, 5000, 5020, 5020),  # 20x20
            ]
        },
    )
    crime = tmp_path / "crime"
    _write_month(
        crime,
        "2024-01",
        [
            _crime_row("2024-01", 1005, 1005, "Burglary"),
            _crime_row("2024-01", 3005, 3010, "Burglary"),
            _crime_row("2024-01", 5010, 5010, "Burglary"),
        ],
    )

    avg_df, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)

    # Re-derive the expected values from the same buffered catchment areas: each
    # postcode is 12/yr before normalisation, then x (median_buf / buffered_area).
    postcodes, polygons = load_postcode_polygons(units)
    buf_area = {
        pc: float(shapely.area(shapely.buffer(poly, _BUFFER_M, quad_segs=8)))
        for pc, poly in zip(postcodes, polygons)
    }
    median_buf = float(np.median(list(buf_area.values())))
    expected = {pc: 12.0 * median_buf / buf_area[pc] for pc in buf_area}

    rows = _by_postcode(avg_df)
    for pc, exp in expected.items():
        assert rows[pc]["Burglary (avg/yr)"] == pytest.approx(exp, abs=0.1)

    # Median catchment unchanged; ordering is by inverse buffered area, but the
    # buffer-ring floor keeps the spread far below the ~4x raw-area ratio.
    assert rows["AB1 1AB"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
    small = rows["AB1 1AA"]["Burglary (avg/yr)"]
    big = rows["AB1 1AC"]["Burglary (avg/yr)"]
    assert small > 12.0 > big
    assert small / big < 1.5

    # by-year series carries the same normalisation.
    small_row = by_year_df.filter(pl.col("postcode") == "AB1 1AA").row(0, named=True)
    assert small_row["Burglary (by year)"] == [
        {"year": 2024, "count": pytest.approx(expected["AB1 1AA"], abs=0.1)}
    ]


def test_avg_yr_is_pooled_rate_over_covered_months(tmp_path):
    # Uneven month coverage across years: 2023 has 1 month (2 incidents),
    # 2024 has 2 months (2 incidents). The headline is the POOLED annualised
    # rate over all covered months: 4 incidents / 3 months * 12 = 16/yr -- not
    # the old mean-of-bars (24+12)/2 = 18, which over-weighted thin years.
    units = _single_postcode_units(tmp_path)
    crime = tmp_path / "crime"
    _write_month(
        crime,
        "2023-01",
        [
            _crime_row("2023-01", 1005, 1005, "Burglary"),
            _crime_row("2023-01", 1005, 1005, "Burglary"),
        ],
    )
    _write_month(crime, "2024-01", [_crime_row("2024-01", 1005, 1005, "Burglary")])
    _write_month(crime, "2024-02", [_crime_row("2024-02", 1005, 1005, "Burglary")])

    avg_df, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)

    avg = avg_df.row(0, named=True)
    assert avg["Burglary (avg/yr)"] == pytest.approx(16.0, abs=0.05)

    # Bars remain per-year annualised: 2023 -> 24/yr (x12), 2024 -> 12/yr (x6).
    bars = _bars(by_year_df.row(0, named=True), "Burglary (by year)")
    assert bars == {
        2023: pytest.approx(24.0, abs=0.05),
        2024: pytest.approx(12.0, abs=0.05),
    }


def test_sporadic_type_is_not_inflated_by_years_present(tmp_path):
    # A single robbery in a 24-covered-month window must read as ~0.5/yr (the
    # long-run pooled rate), NOT 12/yr (the old years-with-incidents mean that
    # inflated sporadic categories by up to ~15x).
    units = _single_postcode_units(tmp_path)
    crime = tmp_path / "crime"
    for year in (2023, 2024):
        for month in range(1, 13):
            ym = f"{year}-{month:02d}"
            # Every month is published; only 2023-06 contains an incident.
            rows = (
                [_crime_row(ym, 1005, 1005, "Robbery")]
                if (year, month) == (2023, 6)
                else []
            )
            _write_month(crime, ym, rows)

    avg_df, by_year_df = _run(tmp_path, crime, units)

    avg = avg_df.row(0, named=True)
    # 1 incident over 24 covered months -> 0.5/yr.
    assert avg["Robbery (avg/yr)"] == pytest.approx(0.5, abs=0.05)

    # The by-year bar still shows the 2023 incident annualised over 12 covered
    # months (1/yr); 2024 is covered with zero robberies -> no bar, but the
    # year IS in the coverage list so consumers may render it as a true zero.
    row = by_year_df.row(0, named=True)
    assert _bars(row, "Robbery (by year)") == {2023: pytest.approx(1.0, abs=0.05)}
    assert _coverage(row) == {2023: 12, 2024: 12}


def test_force_gap_years_are_excluded_not_zeroed(tmp_path):
    # Two postcodes policed by different forces. force-a publishes 2023+2024;
    # force-b publishes only 2023 (a 2024 gap, like Greater Manchester). The
    # b-postcode's headline must pool over force-b's 12 covered months only,
    # and its by-year series must NOT contain a 2024 bar or coverage entry.
    units = tmp_path / "units"
    _write_boundaries(
        units,
        {
            "AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)],
            "CD1": [_square_feature("CD1 1AA", 9000, 9000, 9010, 9010)],
        },
    )
    crime = tmp_path / "crime"
    for month in range(1, 13):
        ym23 = f"2023-{month:02d}"
        ym24 = f"2024-{month:02d}"
        # force-a covers AB1 in both years; one burglary per month in 2024.
        _write_month(crime, ym23, [], force="force-a")
        _write_month(
            crime, ym24, [_crime_row(ym24, 1005, 1005, "Burglary")], force="force-a"
        )
        # force-b covers CD1 in 2023 only: one burglary per month.
        _write_month(
            crime, ym23, [_crime_row(ym23, 9005, 9005, "Burglary")], force="force-b"
        )

    avg_df, by_year_df = _run(tmp_path, crime, units)

    rows = _by_postcode(avg_df)
    # force-a postcode: 12 burglaries over 24 covered months -> 6/yr.
    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == pytest.approx(6.0, abs=0.05)
    # force-b postcode: 12 burglaries over 12 covered months -> 12/yr. Under
    # the old global calendar this would have been diluted to 6/yr by the
    # uncovered 2024.
    assert rows["CD1 1AA"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)

    by_rows = _by_postcode(by_year_df)
    assert _coverage(by_rows["CD1 1AA"]) == {2023: 12}
    assert set(_bars(by_rows["CD1 1AA"], "Burglary (by year)")) == {2023}
    assert _coverage(by_rows["AB1 1AA"]) == {2023: 12, 2024: 12}


def test_residue_incidents_in_uncovered_years_are_excluded(tmp_path):
    # force-b stops publishing after 2023, but a force-a file contains a 2024
    # incident that falls inside the b-postcode's buffer (cross-border residue,
    # the Greater Manchester pattern). That incident must not produce a 2024
    # bar for the b-postcode, nor leak into its pooled headline.
    units = tmp_path / "units"
    _write_boundaries(
        units,
        {
            "AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)],
            "CD1": [_square_feature("CD1 1AA", 9000, 9000, 9010, 9010)],
        },
    )
    crime = tmp_path / "crime"
    for month in range(1, 13):
        ym23 = f"2023-{month:02d}"
        ym24 = f"2024-{month:02d}"
        _write_month(crime, ym23, [], force="force-a")
        # b's own 2023 incidents establish force-b as its home force.
        _write_month(
            crime,
            ym23,
            [_crime_row(ym23, 9005, 9005, "Burglary")] if month <= 6 else [],
            force="force-b",
        )
        # 2024: only force-a publishes; one of its incidents lands in CD1 1AA.
        _write_month(
            crime,
            ym24,
            [_crime_row(ym24, 9005, 9005, "Burglary")] if month == 1 else [],
            force="force-a",
        )

    avg_df, by_year_df = _run(tmp_path, crime, units)

    b_row = avg_df.filter(pl.col("postcode") == "CD1 1AA").row(0, named=True)
    # Pooled over force-b's 12 covered months (2023): 6 incidents -> 6/yr.
    # The residue 2024 incident is excluded (force-b published 0 months in 2024).
    assert b_row["Burglary (avg/yr)"] == pytest.approx(6.0, abs=0.05)

    b_by = by_year_df.filter(pl.col("postcode") == "CD1 1AA").row(0, named=True)
    assert set(_bars(b_by, "Burglary (by year)")) == {2023}
    assert _coverage(b_by) == {2023: 12}


def test_partial_years_below_min_bar_months_get_no_bar(tmp_path):
    # 2023 fully covered; 2024 has only 2 published months. With the default
    # 6-month minimum, 2024 must produce neither a bar (annualising x6 charts
    # noise) nor a coverage entry -- but its incidents and months still count
    # toward the pooled headline.
    units = _single_postcode_units(tmp_path)
    crime = tmp_path / "crime"
    for month in range(1, 13):
        ym = f"2023-{month:02d}"
        _write_month(crime, ym, [_crime_row(ym, 1005, 1005, "Burglary")])
    for month in (1, 2):
        ym = f"2024-{month:02d}"
        _write_month(crime, ym, [_crime_row(ym, 1005, 1005, "Burglary")])

    avg_df, by_year_df = _run(tmp_path, crime, units)

    # Pooled: 14 incidents over 14 covered months -> 12/yr.
    assert avg_df.row(0, named=True)["Burglary (avg/yr)"] == pytest.approx(
        12.0, abs=0.05
    )

    row = by_year_df.row(0, named=True)
    assert set(_bars(row, "Burglary (by year)")) == {2023}
    assert _coverage(row) == {2023: 12}


def test_by_year_output_is_dense_with_coverage(tmp_path):
    # A postcode with zero incidents still gets a by-year row carrying its
    # coverage calendar, so "covered and crime-free" is distinguishable from
    # "no data" downstream.
    units = tmp_path / "units"
    _write_boundaries(
        units,
        {
            "AB1": [
                _square_feature("AB1 1AA", 1000, 1000, 1010, 1010),
                _square_feature("AB1 1AB", 5000, 5000, 5010, 5010),
            ]
        },
    )
    crime = tmp_path / "crime"
    _write_month(crime, "2024-01", [_crime_row("2024-01", 1005, 1005, "Burglary")])

    avg_df, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)

    assert by_year_df.height == 2
    quiet = by_year_df.filter(pl.col("postcode") == "AB1 1AB").row(0, named=True)
    assert quiet["Burglary (by year)"] is None
    assert [c["year"] for c in quiet["covered_years"]] == [2024]

    # And the headline for the quiet postcode is a genuine 0, not null.
    quiet_avg = avg_df.filter(pl.col("postcode") == "AB1 1AB").row(0, named=True)
    assert quiet_avg["Burglary (avg/yr)"] == 0.0


def test_serious_rollup_avg_yr_equals_sum_of_components(tmp_path):
    # Burglary only in 2014, Robbery only in 2024 (one incident each, 2 covered
    # months total). Components pool over the same covered window (each
    # 1 x 12 / 2 = 6/yr) and the rollup equals their sum.
    units = _single_postcode_units(tmp_path)
    crime = tmp_path / "crime"
    _write_month(crime, "2014-01", [_crime_row("2014-01", 1005, 1005, "Burglary")])
    _write_month(crime, "2024-01", [_crime_row("2024-01", 1005, 1005, "Robbery")])

    avg_df, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)

    avg = avg_df.row(0, named=True)
    assert avg["Burglary (avg/yr)"] == pytest.approx(6.0, abs=0.05)
    assert avg["Robbery (avg/yr)"] == pytest.approx(6.0, abs=0.05)
    # Rollup == sum of its component (avg/yr) columns.
    assert avg["Serious crime (avg/yr)"] == pytest.approx(12.0, abs=0.05)
    assert avg["Serious crime (avg/yr)"] == pytest.approx(
        avg["Burglary (avg/yr)"] + avg["Robbery (avg/yr)"], abs=0.05
    )

    # The by-year rollup series remains the per-year sum of the component bars.
    serious_bars = _bars(by_year_df.row(0, named=True), "Serious crime (by year)")
    assert serious_bars == {
        2014: pytest.approx(12.0, abs=0.05),
        2024: pytest.approx(12.0, abs=0.05),
    }


def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
    units = _single_postcode_units(tmp_path)
    crime = tmp_path / "crime"
    _write_month(
        crime,
        "2024-01",
        [
            _crime_row("2024-01", 1005, 1005, "Burglary"),
            _crime_row("2024-01", 1005, 1005, "Cyber fraud"),
        ],
    )

    avg_df, _ = _run(tmp_path, crime, units)

    columns = avg_df.columns
    # The unknown type is dropped (no column for it) but a warning is emitted.
    assert "Cyber fraud (avg/yr)" not in columns
    assert "Burglary (avg/yr)" in columns
    err = capsys.readouterr().err
    assert "Cyber fraud" in err
    assert "WARNING" in err


def test_legacy_crime_types_are_mapped(tmp_path):
    """Pre-2014 crime-type names are aliased to current equivalents in the
    spatial transform instead of being dropped as unknown types."""
    units = _single_postcode_units(tmp_path)
    crime = tmp_path / "crime"
    _write_month(
        crime,
        "2013-01",
        [
            _crime_row("2013-01", 1005, 1005, "Violent crime"),
            _crime_row("2013-01", 1005, 1005, "Public disorder and weapons"),
        ],
    )

    avg_df, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)

    row = avg_df.to_dicts()[0]
    # Single postcode -> area-norm factor 1.0; single covered month -> x12.
    assert row["Violence and sexual offences (avg/yr)"] == 12.0
    assert row["Public order (avg/yr)"] == 12.0

    by_year_row = by_year_df.row(0, named=True)
    assert by_year_row["Violence and sexual offences (by year)"] == [
        {"year": 2013, "count": 12.0}
    ]
    assert by_year_row["Public order (by year)"] == [{"year": 2013, "count": 12.0}]