Fable findings in data

2026-06-11 07:49:23 +01:00 · 2026-06-11 07:49:23 +01:00 · 6a33b03fdf
commit 6a33b03fdf
parent b98bc6d611
20 changed files with 1502 additions and 274 deletions
--- a/pipeline/transform/crime_spatial.py
+++ b/pipeline/transform/crime_spatial.py
@ -15,15 +15,42 @@ crime *density* rather than how much ground the buffer sweeps (a median-sized
 catchment is left unchanged; a large rural postcode is no longer inflated simply
 for covering more of the map). Normalising by the buffered area -- the region
 that actually collects points -- rather than the raw polygon keeps tiny unit
-postcodes from being over-inflated by the fixed buffer-ring floor. The headline
-``"{type} (avg/yr)"`` is the simple mean of the per-year annualised counts, so it
-equals the average of the by-year chart bars.
+postcodes from being over-inflated by the fixed buffer-ring floor. NOTE: this is
+an incident *density of the surrounding streets*, not a per-resident risk --
+zero-resident commercial centres (Soho, retail parks) legitimately rank high.
+
+**Force-coverage calendar.** police.uk has multi-year publication gaps for whole
+forces (Greater Manchester has published nothing between 2019-07 and the present
+except 2022-08; BTP, Gloucestershire, Devon & Cornwall and others have shorter
+gaps). A missing month is *no data*, not zero crime, so every figure here is
+computed against the months the postcode's own force actually published:
+
+* Each postcode is assigned a home force by majority vote of the incidents that
+  matched it (BTP, which reports nationwide, is excluded from the vote);
+  postcodes with no incidents inherit their outcode's majority force, then the
+  national modal force.
+* The headline ``"{type} (avg/yr)"`` is the POOLED annualised rate over the
+  force's covered months: ``sum(counts in covered years) * 12 / covered_months``.
+  Years in which the force published nothing contribute neither incidents nor
+  months, so a coverage gap no longer reads as a low-crime period. (Pooling over
+  covered months also fixes the old "divide by years-with-incidents" headline,
+  which inflated sporadic categories by up to ~15x.)
+* The by-year series only emits bars for years with at least
+  ``min_bar_months`` covered months (default 6): annualising a single observed
+  month x12 produced misleading spikes. Each bar is scaled by the force's
+  covered months in that year, not the global month calendar.
+* ``covered_years`` (list[struct{year, months}]) is written for every postcode
+  so the server can tell "covered, zero crime" (year listed, no bar) from "no
+  data" (year absent) instead of charting gaps as zeros.
+* Postcodes whose boundary buffer is unusable (broken geometry) get null
+  headline columns and an empty ``covered_years`` -- unknown, not zero.

 Outputs mirror the old LSOA transform's shape but are keyed on ``postcode``:

 * ``crime_by_postcode.parquet`` -- ``postcode`` + ``"{type} (avg/yr)"`` columns.
-* ``crime_by_postcode_by_year.parquet`` -- ``postcode`` + ``"{type} (by year)"``
-  nested ``list[struct{year, count}]`` columns, with Serious/Minor rollups.
+* ``crime_by_postcode_by_year.parquet`` -- one row per postcode: ``postcode`` +
+  ``covered_years`` + nested ``"{type} (by year)"`` ``list[struct{year, count}]``
+  columns, with Serious/Minor rollups.

 Caveat: police.uk coordinates are snapped to a fixed set of anonymous "map
 points", not true locations, and a share of rows have no coordinate at all
@ -56,6 +83,22 @@ ALL_CRIME_TYPES: tuple[str, ...] = SERIOUS_CRIME_TYPES + MINOR_CRIME_TYPES

 DEFAULT_BUFFER_M = 100.0
 MONTH_DIR_RE = re.compile(r"^\d{4}-\d{2}$")
+STREET_CSV_NAME_RE = re.compile(r"^(\d{4}-\d{2})-(.+)-street\.csv$")
+
+# Minimum covered months for a year to get a by-year chart bar (and to be
+# listed in `covered_years`). Annualising fewer observed months (x12 from a
+# single month at the worst) produces bars dominated by noise, and the first
+# (2010: one month) and current partial year would otherwise always chart as
+# spikes/dips. Six months keeps the annualisation factor <= 2.
+MIN_BAR_MONTHS = 6
+
+# Forces that report nationwide rather than policing a territory. They never
+# define a postcode's home force (their publication calendar says nothing about
+# whether the *territorial* force covering the postcode published), but their
+# incidents still count toward whichever postcodes they fall in.
+NON_TERRITORIAL_FORCES = frozenset({"btp"})
+
+COVERAGE_COLUMN = "covered_years"

 # Generous GB bounds; points outside fall in no English postcode anyway, but
 # filtering first keeps the WGS84->BNG transform out of its undefined region.
@ -67,27 +110,51 @@ LAT_BOUNDS = (49.0, 61.5)
 _CSV_BATCH = 64


-def _month_calendar(csvs: list[Path]) -> tuple[list[int], dict[int, int], int]:
-    """Derive annualisation denominators from the monthly directory names.
+def _force_calendar(
+    csvs: list[Path],
+) -> tuple[list[int], list[str], np.ndarray]:
+    """Derive the per-force publication calendar from the CSV paths.

-    Each police.uk file lives under ``{crime_dir}/{YYYY-MM}/...`` and holds that
-    month's incidents, so the set of month directories is the set of observed
-    months. Returns the sorted distinct years, months-observed-per-year, and the
-    total month count (the avg/yr denominator).
+    Each police.uk file lives under
+    ``{crime_dir}/{YYYY-MM}/{YYYY-MM}-{force}-street.csv`` and holds that
+    force's incidents for that month, so file presence IS the coverage signal:
+    a (force, month) with no file published nothing. Returns the sorted
+    distinct years, the force slugs (sorted), and ``months_in_year_force`` of
+    shape (n_forces, n_years) -- how many months each force published per year.
    """
-    months = sorted(
-        {path.parent.name for path in csvs if MONTH_DIR_RE.fullmatch(path.parent.name)}
-    )
-    if not months:
-        raise ValueError("No valid YYYY-MM month directories found among crime CSVs")
+    month_force: set[tuple[str, str]] = set()
+    for path in csvs:
+        if not MONTH_DIR_RE.fullmatch(path.parent.name):
+            continue
+        m = STREET_CSV_NAME_RE.fullmatch(path.name)
+        if m is None or m.group(1) != path.parent.name:
+            continue
+        month_force.add((m.group(1), m.group(2)))
+    if not month_force:
+        raise ValueError("No valid YYYY-MM street crime CSVs found")

-    months_in_year: dict[int, int] = {}
-    for month in months:
-        year = int(month[:4])
-        months_in_year[year] = months_in_year.get(year, 0) + 1
+    years = sorted({int(month[:4]) for month, _ in month_force})
+    forces = sorted({force for _, force in month_force})
+    year_to_idx = {year: idx for idx, year in enumerate(years)}
+    force_to_idx = {force: idx for idx, force in enumerate(forces)}

-    years = sorted(months_in_year)
-    return years, months_in_year, len(months)
+    months_in_year_force = np.zeros((len(forces), len(years)), dtype=np.int32)
+    for month, force in month_force:
+        months_in_year_force[force_to_idx[force], year_to_idx[int(month[:4])]] += 1
+
+    # Surface coverage gaps loudly: any force (non-territorial ones included)
+    # missing months that some other force published is exactly the data hole
+    # the coverage masking exists for.
+    all_months = {month for month, _ in month_force}
+    for force in forces:
+        published = {m for m, f in month_force if f == force}
+        missing = len(all_months) - len(published)
+        if missing:
+            print(
+                f"  coverage gap: {force} missing {missing}/{len(all_months)} months"
+            )
+
+    return years, forces, months_in_year_force


 def _build_tree(
@ -111,10 +178,17 @@ def _accumulate_counts(
    tree: shapely.STRtree,
    type_to_idx: dict[str, int],
    year_to_idx: dict[int, int],
+    force_to_idx: dict[str, int],
    transformer: Transformer,
    counts: np.ndarray,
+    force_votes: np.ndarray,
 ) -> None:
-    """Stream the crime CSVs, counting points-in-buffer per (postcode, type, year)."""
+    """Stream the crime CSVs, counting points-in-buffer per (postcode, type, year).
+
+    Also accumulates ``force_votes`` (n_postcodes, n_forces): how many matched
+    incidents each force's files contributed to each postcode, which later
+    elects the postcode's home force for the coverage calendar.
+    """
    schema = {
        "Longitude": pl.Float64,
        "Latitude": pl.Float64,
@ -129,13 +203,22 @@ def _accumulate_counts(

    for start in range(0, len(csvs), _CSV_BATCH):
        batch = csvs[start : start + _CSV_BATCH]
+        # The source file identifies the publishing force (police.uk has no
+        # force column with consistent naming); map each path back to its
+        # force index for the home-force vote.
+        path_to_fidx = {}
+        for path in batch:
+            m = STREET_CSV_NAME_RE.fullmatch(path.name)
+            if m is not None and m.group(2) in force_to_idx:
+                path_to_fidx[str(path)] = force_to_idx[m.group(2)]
        frame = (
            pl.scan_csv(
                batch,
                schema_overrides=schema,
                ignore_errors=True,
+                include_file_paths="_source_path",
            )
-            .select("Longitude", "Latitude", "Month", "Crime type")
+            .select("Longitude", "Latitude", "Month", "Crime type", "_source_path")
            # strict=False: a single malformed Month drops only that row instead
            # of aborting the whole build (a non-numeric year becomes null and is
            # filtered out by the year membership check below).
@ -166,8 +249,11 @@ def _accumulate_counts(
                pl.col("year")
                .replace_strict(year_to_idx, return_dtype=pl.Int32)
                .alias("yidx"),
+                pl.col("_source_path")
+                .replace_strict(path_to_fidx, default=-1, return_dtype=pl.Int32)
+                .alias("fidx"),
            )
-            .select("Longitude", "Latitude", "Crime type", "tidx", "yidx")
+            .select("Longitude", "Latitude", "Crime type", "tidx", "yidx", "fidx")
            .collect(engine="streaming")
        )

@ -186,13 +272,20 @@ def _accumulate_counts(
        lat = frame["Latitude"].to_numpy()
        tidx = frame["tidx"].to_numpy()
        yidx = frame["yidx"].to_numpy()
+        fidx = frame["fidx"].to_numpy()

        x, y = transformer.transform(lon, lat)
        finite = np.isfinite(x) & np.isfinite(y)
        total_dropped += int((~finite).sum())
        if not finite.any():
            continue
-        x, y, tidx, yidx = x[finite], y[finite], tidx[finite], yidx[finite]
+        x, y, tidx, yidx, fidx = (
+            x[finite],
+            y[finite],
+            tidx[finite],
+            yidx[finite],
+            fidx[finite],
+        )
        total_points += x.size

        points = shapely.points(x, y)
@ -203,6 +296,14 @@ def _accumulate_counts(
                (postcode_index, tidx[point_index], yidx[point_index]),
                1,
            )
+            matched_fidx = fidx[point_index]
+            known_force = matched_fidx >= 0
+            if known_force.any():
+                np.add.at(
+                    force_votes,
+                    (postcode_index[known_force], matched_fidx[known_force]),
+                    1,
+                )
            total_matches += point_index.size

        print(
@ -228,6 +329,56 @@ def _accumulate_counts(
        )


+def _assign_home_force(
+    postcodes: np.ndarray,
+    force_votes: np.ndarray,
+    forces: list[str],
+) -> np.ndarray:
+    """Elect each postcode's home (territorial) force.
+
+    Plurality vote (argmax) of matched incidents per publishing force; non-
+    territorial forces (BTP) are excluded because their calendar says nothing
+    about local coverage. Postcodes with no votes (no incidents ever, or
+    BTP-only) inherit the modal home force of their outcode, else the national
+    modal force, so every postcode gets a coverage calendar.
+    """
+    votes = force_votes.astype(np.int64, copy=True)
+    for idx, force in enumerate(forces):
+        if force in NON_TERRITORIAL_FORCES:
+            votes[:, idx] = 0
+
+    home = votes.argmax(axis=1).astype(np.int32)
+    has_vote = votes.max(axis=1) > 0
+    home[~has_vote] = -1
+
+    if not has_vote.any():
+        raise ValueError("No incidents matched any postcode; cannot assign forces")
+
+    # Outcode-majority fallback for postcodes with no (territorial) incidents.
+    outcodes = np.array([pc.split(" ")[0] for pc in postcodes], dtype=object)
+    national_modal = int(
+        np.bincount(home[has_vote], minlength=len(forces)).argmax()
+    )
+    if (~has_vote).any():
+        outcode_modal: dict[str, int] = {}
+        voted_outcodes = outcodes[has_vote]
+        voted_home = home[has_vote]
+        for oc in np.unique(voted_outcodes):
+            counts = np.bincount(voted_home[voted_outcodes == oc], minlength=len(forces))
+            outcode_modal[oc] = int(counts.argmax())
+        fallback = np.array(
+            [outcode_modal.get(oc, national_modal) for oc in outcodes[~has_vote]],
+            dtype=np.int32,
+        )
+        home[~has_vote] = fallback
+        print(
+            f"  {int((~has_vote).sum()):,} postcodes had no territorial incidents; "
+            "home force inherited from outcode majority"
+        )
+
+    return home
+
+
 def _rollup_long(
    long: pl.DataFrame, types: tuple[str, ...], rollup_name: str
 ) -> pl.DataFrame:
@ -244,30 +395,41 @@ def _rollup_long(
 def _write_avg_yr(
    postcodes: np.ndarray,
    counts: np.ndarray,
-    years: list[int],
-    months_in_year: dict[int, int],
+    months_in_year_force: np.ndarray,
+    home_fidx: np.ndarray,
    norm: np.ndarray,
    output_path: Path,
 ) -> None:
    """Write ``postcode`` + ``"{type} (avg/yr)"`` density-normalised averages.

-    The headline figure is the **simple mean of the per-year annualised counts**
-    (each year scaled to a 12-month equivalent), so it equals the average of the
-    by-year chart bars instead of a month-weighted pooled rate. Each postcode's
-    value is then multiplied by ``norm`` (median_area / buffered catchment area)
-    so the metric is a density rather than a footprint-inflated raw count.
+    The headline is the POOLED annualised rate over the home force's covered
+    months: ``sum(counts in covered years) * 12 / covered_months``. Years the
+    force published nothing contribute neither incidents nor months, so a
+    coverage gap (e.g. Greater Manchester 2019-07 onwards) is excluded instead
+    of read as zero crime. Pooling over the full covered window -- rather than
+    averaging only over years a type happened to occur -- is what keeps a
+    single robbery-year from printing as a perennial robbery rate. Each
+    postcode's value is then multiplied by ``norm`` (median_area / buffered
+    catchment area) so the metric is a density rather than a footprint-inflated
+    raw count; postcodes with unusable geometry (norm == 0) are null, not 0.
    """
-    months = np.array([months_in_year[year] for year in years], dtype=np.float64)
-    per_year = counts.astype(np.float64) * 12.0 / months[None, None, :]
-    # Average over the years *this postcode* actually has incidents of *this
-    # type* -- the same per-(postcode, type) x-span the by-year chart plots
-    # (server-rs/.../crime_by_year.rs), so the headline equals the mean of the
-    # by-year bars. Dividing by a global years-present count (years a type
-    # appeared anywhere in England) would deflate postcodes whose incidents
-    # cluster in only a few years of the ~13-year window.
-    years_present = np.clip((counts > 0).sum(axis=2), 1, None).astype(np.float64)
-    avg = per_year.sum(axis=2) / years_present  # (n_postcodes, n_types)
-    avg = np.round(avg * norm[:, None], 1).astype(np.float32)
+    n_postcodes, n_types = counts.shape[0], counts.shape[1]
+    avg = np.full((n_postcodes, n_types), np.nan, dtype=np.float64)
+    for f in range(months_in_year_force.shape[0]):
+        sel = home_fidx == f
+        if not sel.any():
+            continue
+        cov_months = months_in_year_force[f].astype(np.float64)
+        denom = cov_months.sum()
+        if denom <= 0:
+            continue  # force never published; stays null
+        covered_years = cov_months > 0
+        pooled = counts[sel][:, :, covered_years].sum(axis=2, dtype=np.float64)
+        avg[sel] = pooled * 12.0 / denom
+
+    avg *= norm[:, None]
+    avg[norm <= 0] = np.nan  # unusable geometry: unknown, not zero
+    avg = np.round(avg, 1).astype(np.float32)

    data: dict[str, np.ndarray] = {"postcode": postcodes}
    for type_idx, name in enumerate(ALL_CRIME_TYPES):
@ -275,14 +437,10 @@ def _write_avg_yr(

    # Serious/Minor rollup headlines = the exact SUM of their component (avg/yr)
    # columns, so each rollup always equals the sum of the parts shown beside it
-    # and can never fall below one of its own components. (Previously the rollup
-    # re-derived a union-years-present mean: it divided the summed counts by the
-    # number of years in which ANY component type occurred, whereas each
-    # component divides by its OWN years-present. When a postcode's serious/minor
-    # types occurred in disjoint years the union denominator was larger, so the
-    # rollup came out smaller than the sum of its parts.) The by-year rollup
-    # series in _write_by_year is likewise the per-year sum of the component
-    # bars, so headline and chart both present the rollup as the sum of its parts.
+    # and can never fall below one of its own components. All components share
+    # the postcode's pooled covered-month denominator, so the sum is itself the
+    # pooled rollup rate. Null components (unusable geometry) propagate to a
+    # null rollup.
    for rollup_name, rollup_types in (
        ("Serious crime", SERIOUS_CRIME_TYPES),
        ("Minor crime", MINOR_CRIME_TYPES),
@ -292,8 +450,12 @@ def _write_avg_yr(
            avg[:, rollup_idx].sum(axis=1), 1
        ).astype(np.float32)

+    frame = pl.DataFrame(data)
+    value_cols = [c for c in frame.columns if c != "postcode"]
+    frame = frame.with_columns(pl.col(c).fill_nan(None) for c in value_cols)
+
    output_path.parent.mkdir(parents=True, exist_ok=True)
-    pl.DataFrame(data).write_parquet(output_path, compression="zstd")
+    frame.write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime averages: {output_path}")


@ -301,35 +463,60 @@ def _write_by_year(
    postcodes: np.ndarray,
    counts: np.ndarray,
    years: list[int],
-    months_in_year: dict[int, int],
+    months_in_year_force: np.ndarray,
+    home_fidx: np.ndarray,
    norm: np.ndarray,
+    min_bar_months: int,
    output_path: Path,
 ) -> None:
-    """Write nested ``"{type} (by year)"`` series plus Serious/Minor rollups.
+    """Write nested ``"{type} (by year)"`` series plus rollups and coverage.

-    Per-year counts are area-normalised by the same ``norm`` (median_area /
-    buffered catchment area) factor applied to the avg/yr headline, so the chart
-    bars and the headline figure remain mutually consistent.
+    A bar is only emitted for (postcode, year)s where the postcode's home force
+    published at least ``min_bar_months`` months -- annualising a thinner year
+    (x12 from a single month at the extreme) charts noise, and a force-gap year
+    must chart as *no data*, not zero. Bars are scaled by the force's covered
+    months in that year and area-normalised by the same ``norm`` factor as the
+    headline so chart and headline stay mutually consistent.
+
+    Every postcode gets a row (the output is dense) carrying ``covered_years``
+    -- the {year, months} entries for years in which the home force published
+    at least ``min_bar_months`` months -- so consumers can distinguish covered-
+    but-crime-free years (year listed, no bar => genuine zero) from coverage
+    gaps (year absent => unknown). Postcodes with unusable geometry get an
+    empty coverage list: their crime picture is unknown.
    """
-    months = np.array([months_in_year[year] for year in years], dtype=np.float64)
+    # (n_postcodes, n_years): covered months of each postcode's home force.
+    cov_pc_year = months_in_year_force[home_fidx, :]
+    usable = norm > 0
+
    annual = np.round(
-        counts.astype(np.float64) * 12.0 / months[None, None, :] * norm[:, None, None],
+        counts.astype(np.float64)
+        * 12.0
+        / np.maximum(cov_pc_year[:, None, :], 1)
+        * norm[:, None, None],
        1,
    )
+    bar_ok = (
+        (counts > 0)
+        & (cov_pc_year[:, None, :] >= min_bar_months)
+        & usable[:, None, None]
+    )

-    pc_i, ty_i, yr_i = np.nonzero(counts)
-    if pc_i.size == 0:
-        raise ValueError("No crime points matched any postcode buffer")
+    pc_i, ty_i, yr_i = np.nonzero(bar_ok)

    type_names = np.array(ALL_CRIME_TYPES, dtype=object)
    year_values = np.array(years, dtype=np.int32)
+    # Explicit schema: with full masking (e.g. every year below min_bar_months)
+    # the fancy-indexed numpy object arrays are empty and polars would infer
+    # Object columns, which breaks the rollup `is_in` below.
    long = pl.DataFrame(
        {
-            "postcode": postcodes[pc_i],
-            "Crime type": type_names[ty_i],
+            "postcode": postcodes[pc_i].astype(str),
+            "Crime type": type_names[ty_i].astype(str),
            "year": year_values[yr_i],
            "count": annual[pc_i, ty_i, yr_i].astype(np.float32),
-        }
+        },
+        schema_overrides={"postcode": pl.String, "Crime type": pl.String},
    )

    serious = _rollup_long(long, SERIOUS_CRIME_TYPES, "Serious crime")
@ -345,6 +532,46 @@ def _write_by_year(
    type_cols = [c for c in wide.columns if c != "postcode"]
    wide = wide.rename({col: f"{col} (by year)" for col in type_cols})

+    # Dense base: every postcode, with its home force's coverage calendar.
+    # Built per force (there are ~45) and joined on the force index.
+    coverage_per_force: list[list[dict[str, int]]] = []
+    for f in range(months_in_year_force.shape[0]):
+        coverage_per_force.append(
+            [
+                {"year": int(years[y]), "months": int(m)}
+                for y, m in enumerate(months_in_year_force[f])
+                if m >= min_bar_months
+            ]
+        )
+    coverage_frame = pl.DataFrame(
+        {
+            "_fidx": pl.Series(range(len(coverage_per_force)), dtype=pl.Int32),
+            COVERAGE_COLUMN: pl.Series(
+                coverage_per_force,
+                dtype=pl.List(pl.Struct({"year": pl.Int32, "months": pl.Int32})),
+            ),
+        }
+    )
+    base = pl.DataFrame(
+        {
+            "postcode": postcodes,
+            "_fidx": pl.Series(home_fidx, dtype=pl.Int32),
+            "_usable": pl.Series(usable),
+        }
+    )
+    dense = (
+        base.join(coverage_frame, on="_fidx", how="left")
+        .with_columns(
+            # Unusable geometry: empty coverage -- the crime picture is unknown.
+            pl.when(pl.col("_usable"))
+            .then(pl.col(COVERAGE_COLUMN))
+            .otherwise(pl.col(COVERAGE_COLUMN).list.head(0))
+            .alias(COVERAGE_COLUMN)
+        )
+        .drop("_fidx", "_usable")
+    )
+    wide = dense.join(wide, on="postcode", how="left")
+
    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime by-year series: {output_path}  {wide.shape}")
@ -358,6 +585,7 @@ def transform_crime_spatial(
    buffer_m: float = DEFAULT_BUFFER_M,
    max_postcodes: int | None = None,
    max_files: int | None = None,
+    min_bar_months: int = MIN_BAR_MONTHS,
 ) -> None:
    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
    if not csvs:
@ -365,9 +593,9 @@ def transform_crime_spatial(
    if max_files is not None:
        csvs = csvs[:max_files]

-    years, months_in_year, valid_month_count = _month_calendar(csvs)
+    years, forces, months_in_year_force = _force_calendar(csvs)
    print(
-        f"Found {len(csvs):,} street crime CSVs across {valid_month_count} months "
+        f"Found {len(csvs):,} street crime CSVs across {len(forces)} forces "
        f"({years[0]}-{years[-1]})"
        + (f" (ignored {ignored_csv_count} non-street CSVs)" if ignored_csv_count else "")
    )
@ -397,18 +625,35 @@ def transform_crime_spatial(

    type_to_idx = {name: idx for idx, name in enumerate(ALL_CRIME_TYPES)}
    year_to_idx = {year: idx for idx, year in enumerate(years)}
+    force_to_idx = {force: idx for idx, force in enumerate(forces)}
    counts = np.zeros((len(postcodes), len(ALL_CRIME_TYPES), len(years)), dtype=np.int32)
+    force_votes = np.zeros((len(postcodes), len(forces)), dtype=np.int32)

    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
-    _accumulate_counts(csvs, tree, type_to_idx, year_to_idx, transformer, counts)
+    _accumulate_counts(
+        csvs, tree, type_to_idx, year_to_idx, force_to_idx, transformer, counts, force_votes
+    )

-    _write_avg_yr(postcodes, counts, years, months_in_year, norm, output_path)
-    _write_by_year(postcodes, counts, years, months_in_year, norm, by_year_output_path)
+    home_fidx = _assign_home_force(np.asarray(postcodes), force_votes, forces)
+
+    _write_avg_yr(
+        postcodes, counts, months_in_year_force, home_fidx, norm, output_path
+    )
+    _write_by_year(
+        postcodes,
+        counts,
+        years,
+        months_in_year_force,
+        home_fidx,
+        norm,
+        min_bar_months,
+        by_year_output_path,
+    )


 def main() -> None:
    parser = argparse.ArgumentParser(
-        description="Count police.uk crime points within 50m of each postcode boundary"
+        description="Count police.uk crime points near each postcode boundary"
    )
    parser.add_argument(
        "--input",
@ -452,6 +697,12 @@ def main() -> None:
        default=None,
        help="Testing only: process the first N monthly CSV files",
    )
+    parser.add_argument(
+        "--min-bar-months",
+        type=int,
+        default=MIN_BAR_MONTHS,
+        help="Minimum covered months for a year to get a by-year bar",
+    )
    args = parser.parse_args()

    if args.buffer_m <= 0:
@ -465,6 +716,7 @@ def main() -> None:
        buffer_m=args.buffer_m,
        max_postcodes=args.max_postcodes,
        max_files=args.max_files,
+        min_bar_months=args.min_bar_months,
    )