Fable findings in data

2026-06-11 07:49:23 +01:00 · 2026-06-11 07:49:23 +01:00 · 6a33b03fdf
commit 6a33b03fdf
parent b98bc6d611
20 changed files with 1502 additions and 274 deletions
--- a/pipeline/transform/test_crime_spatial.py
+++ b/pipeline/transform/test_crime_spatial.py
@@ -47,11 +47,22 @@ def _crime_row(month: str, x, y, crime_type: str) -> str:
    return f",{month},F,F,{lon},{lat},On or near X,E01000001,L,{crime_type},U,"


-def _write_month(crime_dir, month: str, rows: list[str]) -> None:
+def _write_month(
+    crime_dir, month: str, rows: list[str], force: str = "test-force"
+) -> None:
+    """Write one force's monthly CSV; an empty ``rows`` list still creates the
+    file, which counts as published coverage for that (force, month)."""
    month_dir = crime_dir / month
-    month_dir.mkdir(parents=True)
+    month_dir.mkdir(parents=True, exist_ok=True)
    body = "\n".join([_CSV_HEADER, *rows]) + "\n"
-    (month_dir / f"{month}-test-force-street.csv").write_text(body)
+    (month_dir / f"{month}-{force}-street.csv").write_text(body)
+
+
+def _run(tmp_path, crime, units, **kwargs):
+    output = tmp_path / "crime_by_postcode.parquet"
+    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
+    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0, **kwargs)
+    return pl.read_parquet(output), pl.read_parquet(by_year)


 def test_buffer_overlap_counts_for_each_postcode(tmp_path):
@@ -84,18 +95,9 @@ def test_buffer_overlap_counts_for_each_postcode(tmp_path):
        ],
    )

-    output = tmp_path / "crime_by_postcode.parquet"
-    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
-    # Pin the 50m buffer the geometry above was designed around (the production
-    # default is now 100m). The three squares are equal-area, so area
-    # normalisation leaves the counts unchanged.
-    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
-
-    rows = {
-        r["postcode"]: r
-        for r in pl.read_parquet(output).to_dicts()
-    }
-    # Single month -> annualised x12.
+    avg_df, _ = _run(tmp_path, crime, units)
+    rows = {r["postcode"]: r for r in avg_df.to_dicts()}
+    # Single covered month -> pooled rate x12.
    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == 12.0
    assert rows["AB1 1AB"]["Burglary (avg/yr)"] == 12.0
    assert rows["AB1 1AA"]["Robbery (avg/yr)"] == 0.0
@@ -132,18 +134,14 @@ def test_by_year_annualises_and_rolls_up(tmp_path):
        ],
    )

-    output = tmp_path / "crime_by_postcode.parquet"
-    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
-    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
-
-    by_year_df = pl.read_parquet(by_year)
+    _, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)
    assert by_year_df.height == 1
    cols = set(by_year_df.columns)
    assert {"Burglary (by year)", "Serious crime (by year)", "Minor crime (by year)"} <= cols

    row = by_year_df.row(0, named=True)
    burglary = sorted(row["Burglary (by year)"], key=lambda r: r["year"])
-    # 2023: 1 burglary in 1 month -> 12/yr; 2024: 2 in 2 months -> 12/yr.
+    # 2023: 1 burglary in 1 covered month -> 12/yr; 2024: 2 in 2 months -> 12/yr.
    assert burglary == [
        {"year": 2023, "count": 12.0},
        {"year": 2024, "count": 12.0},
@@ -152,6 +150,9 @@ def test_by_year_annualises_and_rolls_up(tmp_path):
    # 2023 serious = Burglary(12) + Robbery(12) = 24; 2024 = Burglary(12).
    assert serious[2023] == 24.0
    assert serious[2024] == 12.0
+    # Coverage calendar: both years published, with their month counts.
+    coverage = {c["year"]: c["months"] for c in row["covered_years"]}
+    assert coverage == {2023: 1, 2024: 2}


 def test_area_normalisation_divides_out_buffered_catchment(tmp_path):
@@ -184,9 +185,7 @@ def test_area_normalisation_divides_out_buffered_catchment(tmp_path):
        ],
    )

-    output = tmp_path / "crime_by_postcode.parquet"
-    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
-    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
+    avg_df, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)

    # Re-derive the expected values from the same buffered catchment areas: each
    # postcode is 12/yr before normalisation, then x (median_buf / buffered_area).
@@ -198,7 +197,7 @@ def test_area_normalisation_divides_out_buffered_catchment(tmp_path):
    median_buf = float(np.median(list(buf_area.values())))
    expected = {pc: 12.0 * median_buf / buf_area[pc] for pc in buf_area}

-    rows = {r["postcode"]: r for r in pl.read_parquet(output).to_dicts()}
+    rows = {r["postcode"]: r for r in avg_df.to_dicts()}
    for pc, exp in expected.items():
        assert rows[pc]["Burglary (avg/yr)"] == pytest.approx(exp, abs=0.1)

@@ -211,18 +210,17 @@ def test_area_normalisation_divides_out_buffered_catchment(tmp_path):
    assert small / big < 1.5

    # by-year series carries the same normalisation.
-    by_year_df = pl.read_parquet(by_year)
    small_row = by_year_df.filter(pl.col("postcode") == "AB1 1AA").row(0, named=True)
    assert small_row["Burglary (by year)"] == [
        {"year": 2024, "count": pytest.approx(expected["AB1 1AA"], abs=0.1)}
    ]


-def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
-    # Uneven month coverage across years: 2023 has 1 month (2 incidents -> 24/yr),
-    # 2024 has 2 months (2 incidents -> 12/yr). The headline must be the *simple*
-    # mean of the bars (24+12)/2 = 18, not the month-weighted pooled rate
-    # (4 incidents / 3 months * 12 = 16).
+def test_avg_yr_is_pooled_rate_over_covered_months(tmp_path):
+    # Uneven month coverage across years: 2023 has 1 month (2 incidents),
+    # 2024 has 2 months (2 incidents). The headline is the POOLED annualised
+    # rate over all covered months: 4 incidents / 3 months * 12 = 16/yr -- not
+    # the old mean-of-bars (24+12)/2 = 18, which over-weighted thin years.
    units = tmp_path / "units"
    _write_boundaries(
        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
@@ -240,68 +238,179 @@ def test_avg_yr_is_simple_mean_of_year_bars(tmp_path):
    _write_month(crime, "2024-01", [_crime_row("2024-01", 1005, 1005, "Burglary")])
    _write_month(crime, "2024-02", [_crime_row("2024-02", 1005, 1005, "Burglary")])

-    output = tmp_path / "crime_by_postcode.parquet"
-    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
-    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
+    avg_df, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)

-    avg = pl.read_parquet(output).row(0, named=True)
-    assert avg["Burglary (avg/yr)"] == pytest.approx(18.0, abs=0.05)
+    avg = avg_df.row(0, named=True)
+    assert avg["Burglary (avg/yr)"] == pytest.approx(16.0, abs=0.05)

-    row = pl.read_parquet(by_year).row(0, named=True)
+    # Bars remain per-year annualised: 2023 -> 24/yr (x12), 2024 -> 12/yr (x6).
+    row = by_year_df.row(0, named=True)
    bars = {p["year"]: p["count"] for p in row["Burglary (by year)"]}
    assert bars == {2023: pytest.approx(24.0, abs=0.05), 2024: pytest.approx(12.0, abs=0.05)}


-def test_serious_rollup_avg_yr_equals_sum_of_components(tmp_path):
-    # Two SERIOUS types occur in DISJOINT years for one postcode: Burglary only in
-    # 2014, Robbery only in 2024 (each a single full month -> 12/yr). The headline
-    # "Serious crime (avg/yr)" must equal the SUM of its component (avg/yr) columns
-    # (Burglary 12 + Robbery 12 = 24), so the rollup is always the sum of the parts
-    # shown beside it and can never fall below a single component. (The previous
-    # union-years-present mean would have divided the per-year serious total by the
-    # 2 years any serious type occurred, giving a misleading 12 that sits below
-    # both the burglary and robbery rollup contributions.)
+def test_sporadic_type_is_not_inflated_by_years_present(tmp_path):
+    # A single robbery in a 24-covered-month window must read as ~0.5/yr (the
+    # long-run pooled rate), NOT 12/yr (the old years-with-incidents mean that
+    # inflated sporadic categories by up to ~15x).
    units = tmp_path / "units"
    _write_boundaries(
        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
    )

    crime = tmp_path / "crime"
-    _write_month(crime, "2014-01", [_crime_row("2014-01", 1005, 1005, "Burglary")])
-    _write_month(crime, "2024-01", [_crime_row("2024-01", 1005, 1005, "Robbery")])
+    for year in (2023, 2024):
+        for month in range(1, 13):
+            rows = []
+            if (year, month) == (2023, 6):
+                rows = [_crime_row(f"{year}-{month:02d}", 1005, 1005, "Robbery")]
+            _write_month(crime, f"{year}-{month:02d}", rows)

-    output = tmp_path / "crime_by_postcode.parquet"
-    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
-    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
+    avg_df, by_year_df = _run(tmp_path, crime, units)

-    avg = pl.read_parquet(output).row(0, named=True)
-    assert "Serious crime (avg/yr)" in avg
-    assert avg["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
-    assert avg["Robbery (avg/yr)"] == pytest.approx(12.0, abs=0.05)
-    # Rollup == sum of its component (avg/yr) columns.
-    assert avg["Serious crime (avg/yr)"] == pytest.approx(24.0, abs=0.05)
-    assert avg["Serious crime (avg/yr)"] == pytest.approx(
-        avg["Burglary (avg/yr)"] + avg["Robbery (avg/yr)"], abs=0.05
+    avg = avg_df.row(0, named=True)
+    # 1 incident over 24 covered months -> 0.5/yr.
+    assert avg["Robbery (avg/yr)"] == pytest.approx(0.5, abs=0.05)
+    # The by-year bar still shows the 2023 incident annualised over 12 covered
+    # months (1/yr); 2024 is covered with zero robberies -> no bar, but the
+    # year IS in the coverage list so consumers may render it as a true zero.
+    row = by_year_df.row(0, named=True)
+    bars = {p["year"]: p["count"] for p in row["Robbery (by year)"]}
+    assert bars == {2023: pytest.approx(1.0, abs=0.05)}
+    coverage = {c["year"]: c["months"] for c in row["covered_years"]}
+    assert coverage == {2023: 12, 2024: 12}
+
+
+def test_force_gap_years_are_excluded_not_zeroed(tmp_path):
+    # Two postcodes policed by different forces. force-a publishes 2023+2024;
+    # force-b publishes only 2023 (a 2024 gap, like Greater Manchester). The
+    # b-postcode's headline must pool over force-b's 12 covered months only,
+    # and its by-year series must NOT contain a 2024 bar or coverage entry.
+    units = tmp_path / "units"
+    _write_boundaries(
+        units,
+        {
+            "AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)],
+            "CD1": [_square_feature("CD1 1AA", 9000, 9000, 9010, 9010)],
+        },
    )

-    # The by-year rollup series remains the per-year sum of the component bars.
-    serious_bars = {
-        p["year"]: p["count"]
-        for p in pl.read_parquet(by_year).row(0, named=True)["Serious crime (by year)"]
-    }
-    assert serious_bars == {
-        2014: pytest.approx(12.0, abs=0.05),
-        2024: pytest.approx(12.0, abs=0.05),
-    }
+    crime = tmp_path / "crime"
+    for month in range(1, 13):
+        ym23 = f"2023-{month:02d}"
+        ym24 = f"2024-{month:02d}"
+        # force-a covers AB1 in both years; one burglary per month in 2024.
+        _write_month(crime, ym23, [], force="force-a")
+        _write_month(
+            crime, ym24, [_crime_row(ym24, 1005, 1005, "Burglary")], force="force-a"
+        )
+        # force-b covers CD1 in 2023 only: one burglary per month.
+        _write_month(
+            crime, ym23, [_crime_row(ym23, 9005, 9005, "Burglary")], force="force-b"
+        )
+
+    avg_df, by_year_df = _run(tmp_path, crime, units)
+    rows = {r["postcode"]: r for r in avg_df.to_dicts()}
+
+    # force-a postcode: 12 burglaries over 24 covered months -> 6/yr.
+    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == pytest.approx(6.0, abs=0.05)
+    # force-b postcode: 12 burglaries over 12 covered months -> 12/yr. Under
+    # the old global calendar this would have been diluted to 6/yr by the
+    # uncovered 2024.
+    assert rows["CD1 1AA"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)
+
+    by_rows = {r["postcode"]: r for r in by_year_df.to_dicts()}
+    b_coverage = {c["year"]: c["months"] for c in by_rows["CD1 1AA"]["covered_years"]}
+    assert b_coverage == {2023: 12}
+    b_bars = {p["year"]: p["count"] for p in by_rows["CD1 1AA"]["Burglary (by year)"]}
+    assert set(b_bars) == {2023}
+    a_coverage = {c["year"]: c["months"] for c in by_rows["AB1 1AA"]["covered_years"]}
+    assert a_coverage == {2023: 12, 2024: 12}


-def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
-    # P (AB1 1AA) has burglaries only in its single most-recent year (2024); Q
-    # (AB1 1AB), far away, has a burglary in 2014. The type therefore spans TWO
-    # distinct years across all postcodes, but only ONE year for P. The headline
-    # must divide by P's own years-present (1), equalling its single by-year bar
-    # (24/yr) -- not by the global span (2), which would deflate it to 12/yr.
-    # The two squares are equal-area, so area normalisation leaves counts as-is.
+def test_residue_incidents_in_uncovered_years_are_excluded(tmp_path):
+    # force-b stops publishing after 2023, but a force-a file contains a 2024
+    # incident that falls inside the b-postcode's buffer (cross-border residue,
+    # the Greater Manchester pattern). That incident must not produce a 2024
+    # bar for the b-postcode, nor leak into its pooled headline.
+    units = tmp_path / "units"
+    _write_boundaries(
+        units,
+        {
+            "AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)],
+            "CD1": [_square_feature("CD1 1AA", 9000, 9000, 9010, 9010)],
+        },
+    )
+
+    crime = tmp_path / "crime"
+    for month in range(1, 13):
+        ym23 = f"2023-{month:02d}"
+        ym24 = f"2024-{month:02d}"
+        _write_month(crime, ym23, [], force="force-a")
+        # b's own 2023 incidents establish force-b as its home force.
+        _write_month(
+            crime,
+            ym23,
+            [_crime_row(ym23, 9005, 9005, "Burglary")] if month <= 6 else [],
+            force="force-b",
+        )
+        # 2024: only force-a publishes; one of its incidents lands in CD1 1AA.
+        _write_month(
+            crime,
+            ym24,
+            [_crime_row(ym24, 9005, 9005, "Burglary")] if month == 1 else [],
+            force="force-a",
+        )
+
+    avg_df, by_year_df = _run(tmp_path, crime, units)
+
+    b_row = avg_df.filter(pl.col("postcode") == "CD1 1AA").row(0, named=True)
+    # Pooled over force-b's 12 covered months (2023): 6 incidents -> 6/yr.
+    # The residue 2024 incident is excluded (force-b published 0 months in 2024).
+    assert b_row["Burglary (avg/yr)"] == pytest.approx(6.0, abs=0.05)
+
+    b_by = by_year_df.filter(pl.col("postcode") == "CD1 1AA").row(0, named=True)
+    bars = {p["year"]: p["count"] for p in b_by["Burglary (by year)"]}
+    assert set(bars) == {2023}
+    coverage = {c["year"]: c["months"] for c in b_by["covered_years"]}
+    assert coverage == {2023: 12}
+
+
+def test_partial_years_below_min_bar_months_get_no_bar(tmp_path):
+    # 2023 fully covered; 2024 has only 2 published months. With the default
+    # 6-month minimum, 2024 must produce neither a bar (annualising x6 charts
+    # noise) nor a coverage entry -- but its incidents and months still count
+    # toward the pooled headline.
+    units = tmp_path / "units"
+    _write_boundaries(
+        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
+    )
+
+    crime = tmp_path / "crime"
+    for month in range(1, 13):
+        ym = f"2023-{month:02d}"
+        _write_month(crime, ym, [_crime_row(ym, 1005, 1005, "Burglary")])
+    for month in (1, 2):
+        ym = f"2024-{month:02d}"
+        _write_month(crime, ym, [_crime_row(ym, 1005, 1005, "Burglary")])
+
+    avg_df, by_year_df = _run(tmp_path, crime, units)
+
+    # Pooled: 14 incidents over 14 covered months -> 12/yr.
+    assert avg_df.row(0, named=True)["Burglary (avg/yr)"] == pytest.approx(
+        12.0, abs=0.05
+    )
+    row = by_year_df.row(0, named=True)
+    bars = {p["year"]: p["count"] for p in row["Burglary (by year)"]}
+    assert set(bars) == {2023}
+    coverage = {c["year"]: c["months"] for c in row["covered_years"]}
+    assert coverage == {2023: 12}
+
+
+def test_by_year_output_is_dense_with_coverage(tmp_path):
+    # A postcode with zero incidents still gets a by-year row carrying its
+    # coverage calendar, so "covered and crime-free" is distinguishable from
+    # "no data" downstream.
    units = tmp_path / "units"
    _write_boundaries(
        units,
@@ -314,42 +423,52 @@ def test_avg_yr_denominator_is_per_postcode_not_global(tmp_path):
    )

    crime = tmp_path / "crime"
-    # P: 2 burglaries in a single 2024 month -> 24/yr bar, present in 1 year.
-    _write_month(
-        crime,
-        "2024-01",
-        [
-            _crime_row("2024-01", 1005, 1005, "Burglary"),
-            _crime_row("2024-01", 1005, 1005, "Burglary"),
-        ],
+    _write_month(crime, "2024-01", [_crime_row("2024-01", 1005, 1005, "Burglary")])
+
+    avg_df, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)
+    assert by_year_df.height == 2
+
+    quiet = by_year_df.filter(pl.col("postcode") == "AB1 1AB").row(0, named=True)
+    assert quiet["Burglary (by year)"] is None
+    assert [c["year"] for c in quiet["covered_years"]] == [2024]
+    # And the headline for the quiet postcode is a genuine 0, not null.
+    quiet_avg = avg_df.filter(pl.col("postcode") == "AB1 1AB").row(0, named=True)
+    assert quiet_avg["Burglary (avg/yr)"] == 0.0
+
+
+def test_serious_rollup_avg_yr_equals_sum_of_components(tmp_path):
+    # Burglary only in 2014, Robbery only in 2024 (one incident each, 2 covered
+    # months total). Components pool over the same covered window (each
+    # 1 x 12 / 2 = 6/yr) and the rollup equals their sum.
+    units = tmp_path / "units"
+    _write_boundaries(
+        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
    )
-    # Q: 1 burglary in a far-back 2014 month -> widens the type's global span to
-    # two years without adding any incident to P.
-    _write_month(crime, "2014-01", [_crime_row("2014-01", 5005, 5005, "Burglary")])

-    output = tmp_path / "crime_by_postcode.parquet"
-    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
-    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
+    crime = tmp_path / "crime"
+    _write_month(crime, "2014-01", [_crime_row("2014-01", 1005, 1005, "Burglary")])
+    _write_month(crime, "2024-01", [_crime_row("2024-01", 1005, 1005, "Robbery")])

-    rows = {r["postcode"]: r for r in pl.read_parquet(output).to_dicts()}
-    by_year_rows = {
-        r["postcode"]: r for r in pl.read_parquet(by_year).to_dicts()
+    avg_df, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)
+
+    avg = avg_df.row(0, named=True)
+    assert avg["Burglary (avg/yr)"] == pytest.approx(6.0, abs=0.05)
+    assert avg["Robbery (avg/yr)"] == pytest.approx(6.0, abs=0.05)
+    # Rollup == sum of its component (avg/yr) columns.
+    assert avg["Serious crime (avg/yr)"] == pytest.approx(12.0, abs=0.05)
+    assert avg["Serious crime (avg/yr)"] == pytest.approx(
+        avg["Burglary (avg/yr)"] + avg["Robbery (avg/yr)"], abs=0.05
+    )
+
+    # The by-year rollup series remains the per-year sum of the component bars.
+    serious_bars = {
+        p["year"]: p["count"]
+        for p in by_year_df.row(0, named=True)["Serious crime (by year)"]
+    }
+    assert serious_bars == {
+        2014: pytest.approx(12.0, abs=0.05),
+        2024: pytest.approx(12.0, abs=0.05),
    }
-
-    # P's headline equals the simple mean of its own bars (just the 2024 bar).
-    p_bars = {p["year"]: p["count"] for p in by_year_rows["AB1 1AA"]["Burglary (by year)"]}
-    assert p_bars == {2024: pytest.approx(24.0, abs=0.05)}
-    # Per-postcode denominator (1) -> 24.0. The old global denominator (2 years
-    # across all postcodes) would have deflated this to 12.0.
-    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == pytest.approx(24.0, abs=0.05)
-    assert rows["AB1 1AA"]["Burglary (avg/yr)"] == pytest.approx(
-        sum(p_bars.values()) / len(p_bars), abs=0.05
-    )
-
-    # Q likewise: its sole 2014 bar -> 12/yr, divided by its own 1 year = 12.0.
-    q_bars = {p["year"]: p["count"] for p in by_year_rows["AB1 1AB"]["Burglary (by year)"]}
-    assert q_bars == {2014: pytest.approx(12.0, abs=0.05)}
-    assert rows["AB1 1AB"]["Burglary (avg/yr)"] == pytest.approx(12.0, abs=0.05)


 def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
@@ -368,11 +487,8 @@ def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
        ],
    )

-    output = tmp_path / "crime_by_postcode.parquet"
-    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
-    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
-
-    columns = pl.read_parquet(output).columns
+    avg_df, _ = _run(tmp_path, crime, units)
+    columns = avg_df.columns
    # The unknown type is dropped (no column for it) but a warning is emitted.
    assert "Cyber fraud (avg/yr)" not in columns
    assert "Burglary (avg/yr)" in columns
@@ -399,16 +515,13 @@ def test_legacy_crime_types_are_mapped(tmp_path):
        ],
    )

-    output = tmp_path / "crime_by_postcode.parquet"
-    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
-    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
-
-    row = pl.read_parquet(output).to_dicts()[0]
-    # Single postcode -> area-norm factor 1.0; single month/year -> x12.
+    avg_df, by_year_df = _run(tmp_path, crime, units, min_bar_months=1)
+    row = avg_df.to_dicts()[0]
+    # Single postcode -> area-norm factor 1.0; single covered month -> x12.
    assert row["Violence and sexual offences (avg/yr)"] == 12.0
    assert row["Public order (avg/yr)"] == 12.0

-    by_year_row = pl.read_parquet(by_year).row(0, named=True)
+    by_year_row = by_year_df.row(0, named=True)
    assert by_year_row["Violence and sexual offences (by year)"] == [
        {"year": 2013, "count": 12.0}
    ]