import argparse
import re
from pathlib import Path

import polars as pl

# Filenames of street-level crime CSVs: a YYYY-MM prefix, any middle part,
# then a "-street.csv" suffix. Other CSVs found in the input tree are ignored
# (see find_street_crime_csvs).
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")

# Shape of a valid "Month" value (YYYY-MM). Kept as a plain pattern string
# rather than a compiled regex because it is passed to a polars
# `str.contains` expression in transform_crime.
MONTH_RE = r"^\d{4}-\d{2}$"

# Crime types that roll up into "Serious crime" / "Minor crime" aggregates.
# Must match the names used in pipeline/transform/merge.py for the sum_horizontal expressions.
SERIOUS_CRIME_TYPES = (
    "Violence and sexual offences",
    "Robbery",
    "Burglary",
    "Possession of weapons",
)
MINOR_CRIME_TYPES = (
    "Anti-social behaviour",
    "Criminal damage and arson",
    "Shoplifting",
    "Bicycle theft",
    "Theft from the person",
    "Other theft",
    "Vehicle crime",
    "Public order",
    "Drugs",
    "Other crime",
)

# Legacy police.uk crime-type names (pre-2014 taxonomy) mapped to their closest
# current equivalent. Without this, ~1.9M incidents from 2010-2013 ("Violent
# crime", "Public disorder and weapons") are unrecognised and silently dropped,
# which understates pre-2013 serious crime and creates an artificial 2012->2013
# step in the by-year series. Applied with `.replace` (not `.replace_strict`) so
# unmapped current types pass through unchanged.
LEGACY_CRIME_TYPE_ALIASES = {
    "Violent crime": "Violence and sexual offences",
    "Public disorder and weapons": "Public order",
}


def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
    """Recursively collect the street-level crime CSVs under *crime_dir*.

    Returns:
        A pair ``(street_csvs, ignored_count)``: the sorted paths whose file
        name matches ``STREET_CRIME_CSV_RE``, and the number of other ``.csv``
        files that were found but skipped.
    """
    csvs = sorted(crime_dir.rglob("*.csv"))
    street_csvs = [path for path in csvs if STREET_CRIME_CSV_RE.fullmatch(path.name)]
    return street_csvs, len(csvs) - len(street_csvs)


def _valid_crime_rows(df: pl.LazyFrame, valid_month_expr: pl.Expr) -> pl.LazyFrame:
    """Keep rows with a valid month, LSOA code and crime type; add `year`.

    Also rewrites legacy crime-type names to their current equivalents. Shared
    by the headline and by-year outputs so both are built from exactly the
    same set of rows.
    """
    return df.filter(
        valid_month_expr
        & pl.col("LSOA code").is_not_null()
        & (pl.col("LSOA code") != "")
        & pl.col("Crime type").is_not_null()
        & (pl.col("Crime type") != "")
    ).with_columns(
        pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"),
        pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES),
    )


def _months_per_year(filtered: pl.LazyFrame) -> pl.LazyFrame:
    """Count the distinct months observed per year (annualisation denominator).

    Months are counted across the whole dataset rather than per crime type:
    using crime-type-specific months would over-scale years where a rare type
    appears in only some months.
    """
    return filtered.group_by("year").agg(
        pl.col("Month").n_unique().alias("months_in_year")
    )


def transform_crime(
    crime_dir: Path,
    output_path: Path,
    by_year_output_path: Path | None = None,
    lsoa_lookup_path: Path | None = None,
) -> None:
    """Build the per-LSOA average-crimes-per-year table and write it as parquet.

    Args:
        crime_dir: Directory searched recursively for street crime CSVs.
        output_path: Destination parquet with one row per LSOA and one
            "<crime type> (avg/yr)" column per crime type.
        by_year_output_path: If given, also write the per-year series there
            (see ``_write_crime_by_year``).
        lsoa_lookup_path: Optional parquet with ``lsoa11``/``lsoa21`` columns
            used to remap LSOA 2011 codes to LSOA 2021 codes.

    Raises:
        FileNotFoundError: No street crime CSV was found under *crime_dir*.
        ValueError: No valid month, or no valid crime row, was found.
    """
    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
    if not csvs:
        raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")

    # STREET_CRIME_CSV_RE guarantees every name starts with YYYY-MM, so count
    # months from the file names; parent directory names are only a proxy and
    # would report a single "month" for a flat directory layout.
    month_count = len({path.name[:7] for path in csvs})
    print(
        f"Found {len(csvs)} street crime CSV files across {month_count} months"
        + (
            f" (ignored {ignored_csv_count} non-street CSVs)"
            if ignored_csv_count
            else ""
        )
    )

    df = pl.scan_csv(
        csvs,
        schema_overrides={
            "LSOA code": pl.Utf8,
            "Crime type": pl.Utf8,
            "Month": pl.Utf8,
        },
    ).select("LSOA code", "Crime type", "Month")

    df = _apply_lsoa_2011_to_2021(df, lsoa_lookup_path)

    valid_month_expr = pl.col("Month").str.contains(MONTH_RE)
    valid_months = (
        df.filter(valid_month_expr)
        .select("Month")
        .unique()
        .collect(engine="streaming")["Month"]
        .sort()
        .to_list()
    )
    if not valid_months:
        raise ValueError(f"No valid crime months found in {crime_dir}")

    valid_month_count = len(valid_months)
    print(
        f"Using {valid_month_count} valid data months "
        f"({valid_months[0]} to {valid_months[-1]})"
    )

    # Annualise each year separately (count_in_year * 12 / months_in_year), then
    # take the simple mean of those per-year rates over the years each type is
    # present. This makes the headline equal the average of the by-year chart bars
    # (_write_crime_by_year) instead of a month-weighted pooled rate, mirroring
    # crime_spatial._write_avg_yr. `_weight` (≤1) comes from the LSOA 2011→2021
    # lookup: 2011 LSOAs that split into N 2021 LSOAs contribute 1/N of their count
    # to each child, since we don't know which child an incident actually belonged to.
    filtered = _valid_crime_rows(df, valid_month_expr)

    # Months observed *anywhere* in the dataset for each year (annualisation
    # denominator), matching the by-year output's per-year scaling.
    months_per_year = _months_per_year(filtered)

    yearly_counts = (
        # Sum per-incident weights directly: a 2021 LSOA can receive incidents
        # carrying different `_weight`s in the same month (split 2011 parent at
        # 1/N alongside an unsplit one at 1), so `_weight.first() * len` would
        # apply one row's weight to all of them — and nondeterministically so,
        # since `first` after a join has no ordering guarantee.
        filtered.group_by("LSOA code", "year", "Crime type")
        .agg(pl.col("_weight").sum().alias("count"))
        .join(months_per_year, on="year")
        .with_columns(
            (pl.col("count") * 12.0 / pl.col("months_in_year")).alias("per_year")
        )
        # Mean of the per-year annualised rates over the years the type is present
        # (only years with rows are grouped here, so this is the correct x-span).
        .group_by("LSOA code", "Crime type")
        .agg(pl.col("per_year").mean().round(1).alias("yearly_avg"))
        .collect(engine="streaming")
    )

    if yearly_counts.is_empty():
        raise ValueError(f"No valid crime rows found in {crime_dir}")

    print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")

    # Pivot crime types into columns
    wide = yearly_counts.pivot(
        on="Crime type",
        index="LSOA code",
        values="yearly_avg",
    )

    # Fill nulls with 0 and rename columns to be descriptive
    value_cols = [col for col in wide.columns if col != "LSOA code"]
    wide = wide.with_columns(pl.col(col).fill_null(0) for col in value_cols)
    wide = wide.rename({col: f"{col} (avg/yr)" for col in value_cols})

    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")

    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")

    if by_year_output_path is not None:
        _write_crime_by_year(df, valid_month_expr, by_year_output_path)


def _write_crime_by_year(
    df: pl.LazyFrame, valid_month_expr: pl.Expr, by_year_output_path: Path
) -> None:
    """Emit per-LSOA per-type per-year crime counts as nested list[struct] columns.

    Partial years are scaled to a 12-month-equivalent count so cross-year trends
    aren't distorted by months missing from the source data. "Serious crime" and
    "Minor crime" rollup series are appended alongside the individual types.

    Raises:
        ValueError: No valid crime row remains after filtering.
    """
    filtered = _valid_crime_rows(df, valid_month_expr)
    months_per_year = _months_per_year(filtered)

    yearly_per_type = (
        # Sum per-incident weights: incidents landing in the same 2021 LSOA can
        # carry different `_weight`s (a split 2011 parent at 1/N next to an
        # unsplit one at 1), so one row's weight must not be applied to all.
        filtered.group_by("LSOA code", "Crime type", "year")
        .agg(pl.col("_weight").sum().alias("count"))
        .join(months_per_year, on="year")
        .with_columns(
            (pl.col("count").cast(pl.Float32) * 12.0 / pl.col("months_in_year"))
            .round(1)
            .alias("count")
        )
        .select("LSOA code", "Crime type", "year", "count")
        .collect(engine="streaming")
    )

    if yearly_per_type.is_empty():
        raise ValueError("No valid crime rows for by-year output")

    serious_rollup = _rollup_long(yearly_per_type, SERIOUS_CRIME_TYPES, "Serious crime")
    minor_rollup = _rollup_long(yearly_per_type, MINOR_CRIME_TYPES, "Minor crime")
    combined = pl.concat([yearly_per_type, serious_rollup, minor_rollup])

    # Sort by year before grouping so each series is collected in year order.
    by_lsoa_type = (
        combined.sort("year")
        .group_by("LSOA code", "Crime type")
        .agg(pl.struct("year", "count").alias("series"))
    )

    wide_by_year = by_lsoa_type.pivot(
        on="Crime type", index="LSOA code", values="series"
    )

    type_cols = [c for c in wide_by_year.columns if c != "LSOA code"]
    wide_by_year = wide_by_year.rename({col: f"{col} (by year)" for col in type_cols})

    print(f"By-year output shape: {wide_by_year.shape}")
    print(f"By-year columns: {wide_by_year.columns}")

    wide_by_year.write_parquet(by_year_output_path, compression="zstd")
    print(f"Saved by-year output to {by_year_output_path}")


def _rollup_long(
    yearly_per_type: pl.DataFrame, types: tuple[str, ...], rollup_name: str
) -> pl.DataFrame:
    """Sum per-year counts across a set of crime types into a single rollup type.

    The result has the same four columns as *yearly_per_type*, with
    "Crime type" set to *rollup_name* on every row.
    """
    return (
        yearly_per_type.filter(pl.col("Crime type").is_in(list(types)))
        .group_by("LSOA code", "year")
        .agg(pl.col("count").sum().round(1).alias("count"))
        .with_columns(pl.lit(rollup_name).alias("Crime type"))
        .select("LSOA code", "Crime type", "year", "count")
    )


def _apply_lsoa_2011_to_2021(
    df: pl.LazyFrame, lsoa_lookup_path: Path | None
) -> pl.LazyFrame:
    """Remap pre-2022 LSOA 2011 codes to LSOA 2021 codes.

    Police.uk reports older years using LSOA 2011 codes; the rest of the
    pipeline keys on LSOA 2021. Without remapping, those years silently fail to
    join and the crime-over-time chart only shows post-2022 data.

    For 1:1 mappings the LSOA code is rewritten in place. For 1→N splits (one
    2011 LSOA becoming several 2021 ones), each child gets an even share via
    `_weight = 1/N` since the source CSVs don't tell us which child a given
    incident actually fell into. Codes absent from the lookup pass through
    unchanged with `_weight = 1`. Without a lookup path, every row simply gets
    `_weight = 1`.
    """
    if lsoa_lookup_path is None:
        return df.with_columns(pl.lit(1.0).alias("_weight"))

    # Normalise the lookup before deriving weights. N must count *distinct*
    # children, so repeated (lsoa11, lsoa21) pairs are collapsed; rows with a
    # missing code are dropped because they would otherwise yield a 1/0 weight
    # or an extra pass-through row that pushes an incident's total above 1.
    lookup = (
        pl.scan_parquet(lsoa_lookup_path)
        .select("lsoa11", "lsoa21")
        .drop_nulls()
        .unique()
    )
    weighted = lookup.with_columns(
        (1.0 / pl.col("lsoa21").count().over("lsoa11")).alias("_weight")
    )

    return (
        df.join(weighted, left_on="LSOA code", right_on="lsoa11", how="left")
        .with_columns(
            pl.coalesce("lsoa21", "LSOA code").alias("LSOA code"),
            pl.col("_weight").fill_null(1.0),
        )
        .drop("lsoa21")
    )


def main() -> None:
    """Parse command-line arguments and run the crime transform."""
    parser = argparse.ArgumentParser(
        description="Transform crime CSVs into yearly average by LSOA and crime type"
    )
    parser.add_argument(
        "--input", type=Path, required=True, help="Directory containing crime data"
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
    parser.add_argument(
        "--output-by-year",
        type=Path,
        required=False,
        help="Optional output parquet for per-LSOA per-year per-type counts (nested list[struct])",
    )
    parser.add_argument(
        "--lsoa-lookup",
        type=Path,
        required=False,
        help="Optional parquet with columns (lsoa11, lsoa21) for remapping pre-2022 codes",
    )
    args = parser.parse_args()

    transform_crime(
        args.input, args.output, args.output_by_year, args.lsoa_lookup
    )


if __name__ == "__main__":
    main()