Add more data & fix out-of-memory errors (OOMs)
This commit is contained in:
parent
f60fbec9d4
commit
a8cc44ea97
8 changed files with 242 additions and 82 deletions
63
pipeline/transform/crime.py
Normal file
63
pipeline/transform/crime.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
|
||||
def transform_crime(crime_dir: Path, output_path: Path) -> None:
    """Aggregate crime CSVs into one row per LSOA with yearly-average columns.

    Every ``*.csv`` found recursively under ``crime_dir`` is scanned lazily.
    Rows with a null or empty ``LSOA code`` are dropped, crimes are counted
    per LSOA / year / crime type, and those yearly counts are averaged
    (rounded to one decimal place) per LSOA and crime type. The result is
    pivoted so each crime type becomes a ``"<crime type> (avg/yr)"`` column,
    missing combinations are filled with 0, and the table is written as a
    zstd-compressed parquet file.

    Args:
        crime_dir: Directory searched recursively for CSV files.
        output_path: Destination parquet file. Missing parent directories
            are created.

    Raises:
        FileNotFoundError: If no CSV files are found under ``crime_dir``.
    """
    csvs = sorted(crime_dir.rglob("*.csv"))
    if not csvs:
        # Fail early with a clear message instead of an opaque error from
        # the CSV scanner when it is handed an empty list of sources.
        raise FileNotFoundError(f"No CSV files found under {crime_dir}")

    # Counts every top-level entry of crime_dir; presumably there is one
    # sub-directory per month -- verify against the actual data layout.
    entry_count = sum(1 for _ in crime_dir.iterdir())
    print(f"Found {len(csvs)} CSV files across {entry_count} months")

    # Lazy scan restricted to the three needed columns, so the streaming
    # engine never has to hold the full CSVs in memory.
    df = pl.scan_csv(
        csvs,
        schema_overrides={"LSOA code": pl.Utf8, "Crime type": pl.Utf8, "Month": pl.Utf8},
    ).select("LSOA code", "Crime type", "Month")

    # Extract year, count crimes per LSOA / year / crime type
    # NOTE(review): the mean below only covers years in which an LSOA has at
    # least one crime of the given type, because years with no rows produce
    # no group. Confirm this is the intended definition of "yearly average".
    yearly_counts = (
        df.filter(pl.col("LSOA code").is_not_null() & (pl.col("LSOA code") != ""))
        # First four characters of "Month"; assumes a "YYYY-..." format.
        .with_columns(pl.col("Month").str.slice(0, 4).alias("year"))
        .group_by("LSOA code", "year", "Crime type")
        .agg(pl.len().alias("count"))
        .group_by("LSOA code", "Crime type")
        .agg(pl.col("count").mean().round(1).alias("yearly_avg"))
        .collect(engine="streaming")
    )

    print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")

    # Pivot crime types into columns
    wide = yearly_counts.pivot(
        on="Crime type",
        index="LSOA code",
        values="yearly_avg",
    )

    # Fill nulls with 0 and rename columns to be descriptive
    value_cols = [col for col in wide.columns if col != "LSOA code"]
    wide = wide.with_columns([pl.col(col).fill_null(0) for col in value_cols])
    wide = wide.rename({col: f"{col} (avg/yr)" for col in value_cols})

    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")

    # Make sure the destination directory exists so the write cannot fail
    # on a missing parent after all the aggregation work has been done.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command-line arguments and run the crime transformation.

    Both ``--input`` and ``--output`` are required and parsed as paths; they
    are passed to ``transform_crime`` in that order.
    """
    cli = argparse.ArgumentParser(
        description="Transform crime CSVs into yearly average by LSOA and crime type"
    )

    # (flag, help text) for each required path argument, in registration order.
    path_flags = (
        ("--input", "Directory containing crime data"),
        ("--output", "Output parquet file path"),
    )
    for flag, help_text in path_flags:
        cli.add_argument(flag, type=Path, required=True, help=help_text)

    options = cli.parse_args()
    transform_crime(options.input, options.output)
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue