67 lines
2.1 KiB
Python
67 lines
2.1 KiB
Python
import argparse
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
|
|
|
|
def transform_crime(crime_dir: Path, output_path: Path) -> None:
    """Aggregate crime CSVs into one row per LSOA with a yearly average per crime type.

    Every ``*.csv`` found recursively under ``crime_dir`` is scanned for the
    ``LSOA code``, ``Crime type`` and ``Month`` columns. Rows without an LSOA
    code are dropped, records are counted per LSOA / year / crime type, and
    each LSOA / crime-type total is divided by the number of distinct years
    present in the data. The result is pivoted to one column per crime type
    (suffixed ``" (avg/yr)"``), missing combinations are filled with 0, and
    the table is written to ``output_path`` as zstd-compressed parquet.

    Args:
        crime_dir: Directory searched recursively for CSV files.
        output_path: Destination path of the parquet file.

    Raises:
        FileNotFoundError: If no CSV file exists under ``crime_dir``.
    """
    csvs = sorted(crime_dir.rglob("*.csv"))
    if not csvs:
        # Fail early with a clear message instead of an opaque scan error.
        raise FileNotFoundError(f"No CSV files found under {crime_dir}")

    # Count the directories that actually hold CSVs, so stray files or empty
    # folders next to the data do not inflate the reported number.
    csv_dirs = {csv.parent for csv in csvs}
    print(f"Found {len(csvs)} CSV files across {len(csv_dirs)} months")

    records = pl.scan_csv(
        csvs,
        schema_overrides={
            "LSOA code": pl.Utf8,
            "Crime type": pl.Utf8,
            "Month": pl.Utf8,
        },
    ).select("LSOA code", "Crime type", "Month")

    # Count crimes per LSOA / year / crime type. The year is the first four
    # characters of "Month" (presumably formatted "YYYY-MM" -- confirm).
    has_lsoa = pl.col("LSOA code").is_not_null() & (pl.col("LSOA code") != "")
    per_year = (
        records.filter(has_lsoa)
        .with_columns(pl.col("Month").str.slice(0, 4).alias("year"))
        .group_by("LSOA code", "year", "Crime type")
        .agg(pl.len().alias("count"))
        .collect(engine="streaming")
    )

    # Average over every year covered by the data, not only over the years in
    # which a given LSOA / crime type had at least one record: a plain mean of
    # the per-year counts silently drops zero-crime years and overstates the
    # average. max(..., 1) avoids dividing by zero when no year is available.
    n_years = max(per_year["year"].drop_nulls().n_unique(), 1)
    yearly_counts = per_year.group_by("LSOA code", "Crime type").agg(
        (pl.col("count").sum() / n_years).round(1).alias("yearly_avg")
    )

    print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")

    # Pivot crime types into columns
    wide = yearly_counts.pivot(
        on="Crime type",
        index="LSOA code",
        values="yearly_avg",
    )

    # Fill nulls with 0 and rename columns to be descriptive. Columns and rows
    # are sorted because group-by output order is not guaranteed, and a stable
    # layout keeps the written file reproducible between runs.
    value_cols = sorted(col for col in wide.columns if col != "LSOA code")
    wide = (
        wide.select("LSOA code", *value_cols)
        .with_columns(pl.col(col).fill_null(0) for col in value_cols)
        .rename({col: f"{col} (avg/yr)" for col in value_cols})
        .sort("LSOA code")
    )

    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")

    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
|
|
|
|
|
|
def main() -> None:
    """Parse the command-line options and run the crime transformation.

    Two mandatory options are read from the command line, ``--input`` and
    ``--output``; both are converted to ``Path`` and handed to
    ``transform_crime``.
    """
    cli = argparse.ArgumentParser(
        description="Transform crime CSVs into yearly average by LSOA and crime type"
    )

    # Both options share the same shape (required, parsed as Path), so they
    # are declared from a small table instead of two near-identical calls.
    path_options = (
        ("--input", "Directory containing crime data"),
        ("--output", "Output parquet file path"),
    )
    for flag, help_text in path_options:
        cli.add_argument(flag, type=Path, required=True, help=help_text)

    options = cli.parse_args()
    transform_crime(options.input, options.output)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()
|