has issues

2026-05-25 13:20:17 +01:00 · 2026-05-25 13:20:17 +01:00 · c645b0f1d4
commit c645b0f1d4
parent 2e112d7398
96 changed files with 2147083 additions and 5787 deletions
--- a/pipeline/transform/crime.py
+++ b/pipeline/transform/crime.py
@ -7,6 +7,27 @@ import polars as pl
 STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
 MONTH_RE = r"^\d{4}-\d{2}$"

+# Crime types that roll up into "Serious crime" / "Minor crime" aggregates.
+# Must match the names used in pipeline/transform/merge.py for the sum_horizontal expressions.
+# Members are compared against the "Crime type" column by exact string match
+# (see the is_in filter in _rollup_long), so spelling and capitalisation matter.
+# Summed into the "Serious crime" rollup by _write_crime_by_year.
+SERIOUS_CRIME_TYPES = (
+    "Violence and sexual offences",
+    "Robbery",
+    "Burglary",
+    "Possession of weapons",
+)
+# Summed into the "Minor crime" rollup by _write_crime_by_year.
+MINOR_CRIME_TYPES = (
+    "Anti-social behaviour",
+    "Criminal damage and arson",
+    "Shoplifting",
+    "Bicycle theft",
+    "Theft from the person",
+    "Other theft",
+    "Vehicle crime",
+    "Public order",
+    "Drugs",
+    "Other crime",
+)
+

 def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
    csvs = sorted(crime_dir.rglob("*.csv"))
@ -14,7 +35,12 @@ def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
    return street_csvs, len(csvs) - len(street_csvs)


-def transform_crime(crime_dir: Path, output_path: Path) -> None:
+def transform_crime(
+    crime_dir: Path,
+    output_path: Path,
+    by_year_output_path: Path | None = None,
+    lsoa_lookup_path: Path | None = None,
+) -> None:
    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
    if not csvs:
        raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
@ -38,6 +64,8 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
        },
    ).select("LSOA code", "Crime type", "Month")

+    df = _apply_lsoa_2011_to_2021(df, lsoa_lookup_path)
+
    valid_month_expr = pl.col("Month").str.contains(MONTH_RE)
    valid_months = (
        df.filter(valid_month_expr)
@ -57,6 +85,9 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
    )

    # Count monthly incidents, then annualise over every valid month in the dataset.
+    # `_weight` (≤1) comes from the LSOA 2011→2021 lookup: 2011 LSOAs that split
+    # into N 2021 LSOAs contribute 1/N of their count to each child, since we
+    # don't know which child a given incident actually belonged to.
    yearly_counts = (
        df.filter(
            valid_month_expr
@ -66,7 +97,7 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
            & (pl.col("Crime type") != "")
        )
        .group_by("LSOA code", "Month", "Crime type")
-        .agg(pl.len().alias("count"))
+        .agg((pl.col("_weight").first() * pl.len()).alias("count"))
        .group_by("LSOA code", "Crime type")
        .agg(
            (pl.col("count").sum() / pl.lit(valid_month_count) * 12)
@ -98,6 +129,118 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")

+    if by_year_output_path is not None:
+        _write_crime_by_year(df, valid_month_expr, by_year_output_path)
+
+
+def _write_crime_by_year(
+    df: pl.LazyFrame, valid_month_expr: pl.Expr, by_year_output_path: Path
+) -> None:
+    """Emit per-LSOA per-type per-year crime counts as nested list[struct] columns.
+
+    The output has one row per LSOA and one column per crime type, named
+    "<crime type> (by year)". Each cell is a list of ``{year, count}`` structs
+    ordered by year. "Serious crime" and "Minor crime" rollups are added next
+    to the individual crime types.
+
+    Partial years are scaled to a 12-month-equivalent count so cross-year trends
+    aren't distorted by months missing from the source data.
+
+    Args:
+        df: Incident-level frame with "LSOA code", "Crime type", "Month" and
+            "_weight" columns (one row per incident share).
+        valid_month_expr: Boolean expression selecting rows with a usable
+            "Month"; the first four characters of "Month" are parsed as the
+            year, so it must only admit values that start with a numeric year.
+        by_year_output_path: Destination parquet file.
+
+    Raises:
+        ValueError: If no rows survive the validity filter.
+    """
+    filtered = df.filter(
+        valid_month_expr
+        & pl.col("LSOA code").is_not_null()
+        & (pl.col("LSOA code") != "")
+        & pl.col("Crime type").is_not_null()
+        & (pl.col("Crime type") != "")
+    ).with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))
+
+    # Months observed *anywhere* in the dataset for each year (annualisation denominator).
+    # Using crime-type-specific months would over-scale years where a rare type appears
+    # in only some months.
+    months_per_year = filtered.group_by("year").agg(
+        pl.col("Month").n_unique().alias("months_in_year")
+    )
+
+    yearly_per_type = (
+        # Sum the per-row weights instead of `first() * len()`: rows landing in
+        # the same 2021 LSOA may carry different weights (several 2011 parents
+        # with different split factors, or remapped rows mixed with weight-1.0
+        # rows), and only the sum attributes each incident share exactly once.
+        filtered.group_by("LSOA code", "Crime type", "year")
+        .agg(pl.col("_weight").sum().alias("count"))
+        .join(months_per_year, on="year")
+        .with_columns(
+            (pl.col("count").cast(pl.Float32) * 12.0 / pl.col("months_in_year"))
+            .round(1)
+            .alias("count")
+        )
+        .select("LSOA code", "Crime type", "year", "count")
+        .collect(engine="streaming")
+    )
+
+    if yearly_per_type.is_empty():
+        raise ValueError("No valid crime rows for by-year output")
+
+    serious_rollup = _rollup_long(yearly_per_type, SERIOUS_CRIME_TYPES, "Serious crime")
+    minor_rollup = _rollup_long(yearly_per_type, MINOR_CRIME_TYPES, "Minor crime")
+    combined = pl.concat([yearly_per_type, serious_rollup, minor_rollup])
+
+    # Sort before grouping: group_by keeps row order within each group, so every
+    # collected series comes out in ascending year order.
+    by_lsoa_type = (
+        combined.sort("year")
+        .group_by("LSOA code", "Crime type")
+        .agg(pl.struct("year", "count").alias("series"))
+    )
+
+    wide_by_year = by_lsoa_type.pivot(
+        on="Crime type", index="LSOA code", values="series"
+    )
+
+    # Suffix every crime-type column; "LSOA code" stays as the key column.
+    type_cols = [c for c in wide_by_year.columns if c != "LSOA code"]
+    wide_by_year = wide_by_year.rename({col: f"{col} (by year)" for col in type_cols})
+
+    print(f"By-year output shape: {wide_by_year.shape}")
+    print(f"By-year columns: {wide_by_year.columns}")
+
+    wide_by_year.write_parquet(by_year_output_path, compression="zstd")
+    print(f"Saved by-year output to {by_year_output_path}")
+
+
+def _rollup_long(
+    yearly_per_type: pl.DataFrame, types: tuple[str, ...], rollup_name: str
+) -> pl.DataFrame:
+    """Collapse a set of crime types into one rollup type, per LSOA and year.
+
+    Rows whose "Crime type" is in ``types`` are summed by ("LSOA code", "year"),
+    rounded to one decimal place and labelled with ``rollup_name``. The result
+    has the columns "LSOA code", "Crime type", "year", "count", in that order.
+    """
+    members = yearly_per_type.filter(pl.col("Crime type").is_in(list(types)))
+    totals = members.group_by("LSOA code", "year").agg(
+        pl.col("count").sum().round(1).alias("count")
+    )
+    labelled = totals.with_columns(pl.lit(rollup_name).alias("Crime type"))
+    return labelled.select("LSOA code", "Crime type", "year", "count")
+
+
+def _apply_lsoa_2011_to_2021(
+    df: pl.LazyFrame, lsoa_lookup_path: Path | None
+) -> pl.LazyFrame:
+    """Remap pre-2022 LSOA 2011 codes to LSOA 2021 codes.
+
+    Police.uk reports older years using LSOA 2011 codes; the rest of the pipeline
+    keys on LSOA 2021. Without remapping, those years silently fail to join and
+    the crime-over-time chart only shows post-2022 data.
+
+    For 1:1 mappings the LSOA code is rewritten in place. For 1→N splits (one
+    2011 LSOA becoming several 2021 ones), each child gets an even share via
+    `_weight = 1/N` since the source CSVs don't tell us which child a given
+    incident actually fell into. A split therefore yields N output rows per
+    incident, so downstream counts must sum `_weight` rather than count rows.
+
+    Args:
+        df: Incident-level frame with an "LSOA code" column.
+        lsoa_lookup_path: Parquet file with "lsoa11" and "lsoa21" columns, or
+            None to skip remapping.
+
+    Returns:
+        ``df`` with "LSOA code" rewritten where the lookup has a match and a
+        float "_weight" column added. Rows with no match keep their code and
+        get weight 1.0; with no lookup every row gets weight 1.0.
+    """
+    if lsoa_lookup_path is None:
+        return df.with_columns(pl.lit(1.0).alias("_weight"))
+
+    # Clean the lookup before deriving shares: a repeated (lsoa11, lsoa21) pair
+    # would inflate N and duplicate incidents in the join, and a null lsoa21
+    # could make the count 0 (an infinite weight) while remapping nothing.
+    lookup = (
+        pl.scan_parquet(lsoa_lookup_path)
+        .select("lsoa11", "lsoa21")
+        .drop_nulls()
+        .unique()
+    )
+    weighted = lookup.with_columns(
+        (1.0 / pl.col("lsoa21").count().over("lsoa11")).alias("_weight")
+    )
+    return (
+        df.join(weighted, left_on="LSOA code", right_on="lsoa11", how="left")
+        .with_columns(
+            # Codes absent from the lookup (e.g. already LSOA 2021) pass through.
+            pl.coalesce("lsoa21", "LSOA code").alias("LSOA code"),
+            pl.col("_weight").fill_null(1.0),
+        )
+        .drop("lsoa21")
+    )
+

 def main() -> None:
    parser = argparse.ArgumentParser(
@ -109,8 +252,22 @@ def main() -> None:
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
+    parser.add_argument(
+        "--output-by-year",
+        type=Path,
+        required=False,
+        help="Optional output parquet for per-LSOA per-year per-type counts (nested list[struct])",
+    )
+    parser.add_argument(
+        "--lsoa-lookup",
+        type=Path,
+        required=False,
+        help="Optional parquet with columns (lsoa11, lsoa21) for remapping pre-2022 codes",
+    )
    args = parser.parse_args()
-    transform_crime(args.input, args.output)
+    transform_crime(
+        args.input, args.output, args.output_by_year, args.lsoa_lookup
+    )


 if __name__ == "__main__":