perfect-postcode/pipeline/transform/crime.py

87 lines
2.7 KiB
Python

import argparse
import re
from pathlib import Path
import polars as pl
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
csvs = sorted(crime_dir.rglob("*.csv"))
street_csvs = [path for path in csvs if STREET_CRIME_CSV_RE.fullmatch(path.name)]
return street_csvs, len(csvs) - len(street_csvs)
def transform_crime(crime_dir: Path, output_path: Path) -> None:
csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
if not csvs:
raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
month_count = len({path.parent.name for path in csvs})
print(
f"Found {len(csvs)} street crime CSV files across {month_count} months"
+ (
f" (ignored {ignored_csv_count} non-street CSVs)"
if ignored_csv_count
else ""
)
)
df = pl.scan_csv(
csvs,
schema_overrides={
"LSOA code": pl.Utf8,
"Crime type": pl.Utf8,
"Month": pl.Utf8,
},
).select("LSOA code", "Crime type", "Month")
# Extract year, count crimes per LSOA / year / crime type
yearly_counts = (
df.filter(pl.col("LSOA code").is_not_null() & (pl.col("LSOA code") != ""))
.with_columns(pl.col("Month").str.slice(0, 4).alias("year"))
.group_by("LSOA code", "year", "Crime type")
.agg(pl.len().alias("count"))
.group_by("LSOA code", "Crime type")
.agg(pl.col("count").mean().round(1).alias("yearly_avg"))
.collect(engine="streaming")
)
print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")
# Pivot crime types into columns
wide = yearly_counts.pivot(
on="Crime type",
index="LSOA code",
values="yearly_avg",
)
# Fill nulls with 0 and rename columns to be descriptive
value_cols = [col for col in wide.columns if col != "LSOA code"]
wide = wide.with_columns(pl.col(col).fill_null(0) for col in value_cols)
wide = wide.rename({col: f"{col} (avg/yr)" for col in value_cols})
print(f"Output shape: {wide.shape}")
print(f"Columns: {wide.columns}")
wide.write_parquet(output_path, compression="zstd")
print(f"Saved to {output_path}")
def main() -> None:
parser = argparse.ArgumentParser(
description="Transform crime CSVs into yearly average by LSOA and crime type"
)
parser.add_argument(
"--input", type=Path, required=True, help="Directory containing crime data"
)
parser.add_argument(
"--output", type=Path, required=True, help="Output parquet file path"
)
args = parser.parse_args()
transform_crime(args.input, args.output)
if __name__ == "__main__":
main()