# perfect-postcode/pipeline/transform/crime.py

import argparse
from pathlib import Path

import polars as pl


def transform_crime(crime_dir: Path, output_path: Path) -> None:
    """Aggregate crime CSVs into a per-LSOA table of yearly average crime counts.

    Every ``*.csv`` found recursively under ``crime_dir`` is read, rows without
    an LSOA code are dropped, and crimes are counted per LSOA code, year (the
    first four characters of ``Month``) and crime type. The result is written
    to ``output_path`` as a zstd-compressed parquet file with one row per LSOA
    code and one ``"<crime type> (avg/yr)"`` column per crime type.

    The average is the total count divided by the number of distinct years
    present in the whole input, so a year in which an LSOA recorded none of a
    given crime type contributes zero instead of being left out of the mean.
    A year that is only partly covered by the input still counts as one full
    year in the denominator.

    Args:
        crime_dir: Directory searched recursively for crime CSV files.
        output_path: Destination of the parquet file.

    Raises:
        FileNotFoundError: If no CSV files exist under ``crime_dir``.
    """
    csvs = sorted(crime_dir.rglob("*.csv"))
    if not csvs:
        # Fail early with a clear message instead of an opaque reader error.
        raise FileNotFoundError(f"No CSV files found under {crime_dir}")

    # Count the folders that actually hold CSVs rather than every directory
    # entry, so stray files do not inflate the figure.
    # NOTE(review): reporting these as "months" assumes one folder per month.
    month_dirs = {path.parent for path in csvs}
    print(f"Found {len(csvs)} CSV files across {len(month_dirs)} months")

    df = pl.scan_csv(
        csvs,
        schema_overrides={
            "LSOA code": pl.Utf8,
            "Crime type": pl.Utf8,
            "Month": pl.Utf8,
        },
    ).select("LSOA code", "Crime type", "Month")

    # Count crimes per LSOA / year / crime type. This is the only pass over
    # the raw CSVs; everything after it works on the aggregated frame.
    yearly_counts = (
        df.filter(pl.col("LSOA code").is_not_null() & (pl.col("LSOA code") != ""))
        .with_columns(pl.col("Month").str.slice(0, 4).alias("year"))
        .group_by("LSOA code", "year", "Crime type")
        .agg(pl.len().alias("count"))
        .collect(engine="streaming")
    )

    print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")

    # Divide by the number of years covered by the data set rather than
    # averaging only over the years in which a crime type occurred; the latter
    # ignores zero-crime years and overstates the yearly rate.
    n_years = max(yearly_counts["year"].drop_nulls().n_unique(), 1)
    averages = yearly_counts.group_by("LSOA code", "Crime type").agg(
        (pl.col("count").sum() / n_years).round(1).alias("yearly_avg")
    )

    # Pivot crime types into columns. group_by output order is arbitrary, so
    # sort both the columns and the rows to make the output file reproducible.
    wide = averages.pivot(
        on="Crime type",
        index="LSOA code",
        values="yearly_avg",
        sort_columns=True,
    ).sort("LSOA code")

    # A missing LSOA / crime type combination means no recorded crimes: use 0.
    value_cols = [col for col in wide.columns if col != "LSOA code"]
    wide = wide.with_columns([pl.col(col).fill_null(0.0) for col in value_cols])
    wide = wide.rename({col: f"{col} (avg/yr)" for col in value_cols})

    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")

    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")


def main() -> None:
    """Parse the command-line options and run the crime transformation.

    Both ``--input`` (directory containing crime data) and ``--output``
    (parquet file to write) are required and are parsed as ``Path`` objects.
    """
    cli = argparse.ArgumentParser(
        description="Transform crime CSVs into yearly average by LSOA and crime type"
    )
    # Both options share the same type and are mandatory; only the flag name
    # and help text differ.
    for flag, help_text in (
        ("--input", "Directory containing crime data"),
        ("--output", "Output parquet file path"),
    ):
        cli.add_argument(flag, type=Path, required=True, help=help_text)

    options = cli.parse_args()
    transform_crime(options.input, options.output)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()