perfect-postcode/pipeline/transform/crime.py
2026-02-07 19:13:36 +00:00

67 lines
2.1 KiB
Python

import argparse
from pathlib import Path
import polars as pl
def transform_crime(crime_dir: Path, output_path: Path) -> None:
    """Aggregate crime CSVs into a per-LSOA table of yearly averages.

    Every ``*.csv`` found recursively under *crime_dir* is scanned for its
    ``LSOA code``, ``Crime type`` and ``Month`` columns. Rows with a missing
    or empty LSOA code are dropped, the remaining rows are counted per
    LSOA / year / crime type, and those yearly counts are averaged per
    LSOA / crime type (rounded to one decimal). The result is pivoted so each
    crime type becomes a ``"<crime type> (avg/yr)"`` column, with missing
    combinations filled with 0, and written to *output_path* as
    zstd-compressed parquet. Progress information is printed to stdout.

    Rows are sorted by ``LSOA code`` and value columns by name so that the
    output is identical between runs on the same input.

    Args:
        crime_dir: Directory searched recursively for crime CSV files.
        output_path: Destination of the parquet file.

    Raises:
        FileNotFoundError: If no CSV file exists under *crime_dir*.
    """
    csvs = sorted(crime_dir.rglob("*.csv"))
    # Count sub-directories only, so stray files sitting next to the data
    # folders do not inflate the reported figure.
    # NOTE(review): assumes each direct sub-directory is one month - confirm.
    month_dirs = sum(1 for entry in crime_dir.iterdir() if entry.is_dir())
    print(f"Found {len(csvs)} CSV files across {month_dirs} months")
    if not csvs:
        # Fail early with a clear message instead of handing polars an empty
        # source list.
        raise FileNotFoundError(f"No CSV files found under {crime_dir}")

    df = pl.scan_csv(
        csvs,
        schema_overrides={
            "LSOA code": pl.Utf8,
            "Crime type": pl.Utf8,
            "Month": pl.Utf8,
        },
    ).select("LSOA code", "Crime type", "Month")

    # Extract year, count crimes per LSOA / year / crime type, then average
    # the yearly counts per LSOA / crime type.
    # NOTE(review): the mean only covers years in which the LSOA / crime type
    # pair has at least one row; years without any row do not count as 0, and
    # a year with only some months present is weighted like a full year.
    yearly_counts = (
        df.filter(pl.col("LSOA code").is_not_null() & (pl.col("LSOA code") != ""))
        # The year is taken as the first four characters of "Month".
        .with_columns(pl.col("Month").str.slice(0, 4).alias("year"))
        .group_by("LSOA code", "year", "Crime type")
        .agg(pl.len().alias("count"))
        .group_by("LSOA code", "Crime type")
        .agg(pl.col("count").mean().round(1).alias("yearly_avg"))
        .collect(engine="streaming")
    )
    print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")

    # Pivot crime types into columns
    wide = yearly_counts.pivot(
        on="Crime type",
        index="LSOA code",
        values="yearly_avg",
    )

    # group_by does not guarantee output order, so the pivot's row and column
    # order can differ between runs; sort both to make the file reproducible.
    value_cols = sorted(col for col in wide.columns if col != "LSOA code")
    # Fill nulls with 0 and rename columns to be descriptive
    wide = wide.select(
        "LSOA code",
        *(
            pl.col(col).fill_null(0).alias(f"{col} (avg/yr)")
            for col in value_cols
        ),
    ).sort("LSOA code")

    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
def main() -> None:
    """Read ``--input`` and ``--output`` from the command line and run the transform."""
    cli = argparse.ArgumentParser(
        description="Transform crime CSVs into yearly average by LSOA and crime type"
    )

    # Both options are mandatory filesystem paths, so declare them from one table.
    required_paths = (
        ("--input", "Directory containing crime data"),
        ("--output", "Output parquet file path"),
    )
    for flag, help_text in required_paths:
        cli.add_argument(flag, type=Path, required=True, help=help_text)

    opts = cli.parse_args()
    transform_crime(opts.input, opts.output)
# Run the command-line entry point only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()