274 lines
9.2 KiB
Python
274 lines
9.2 KiB
Python
import argparse
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
|
|
# File-name shape of a street-level crime extract: "<YYYY-MM>-<anything>-street.csv".
# Any other CSV found under the input directory is counted as ignored.
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
# "YYYY-MM" shape check for the "Month" column. Kept as a plain pattern string
# (not a compiled `re` object) because it is handed to a Polars string expression.
MONTH_RE = r"^\d{4}-\d{2}$"
|
|
|
|
# Crime types that roll up into the "Serious crime" / "Minor crime" aggregates
# emitted alongside the individual per-type series in the by-year output.
# Must match the names used in pipeline/transform/merge.py for the
# sum_horizontal expressions.
SERIOUS_CRIME_TYPES = (
    "Violence and sexual offences",
    "Robbery",
    "Burglary",
    "Possession of weapons",
)
MINOR_CRIME_TYPES = (
    "Anti-social behaviour",
    "Criminal damage and arson",
    "Shoplifting",
    "Bicycle theft",
    "Theft from the person",
    "Other theft",
    "Vehicle crime",
    "Public order",
    "Drugs",
    "Other crime",
)
|
|
|
|
|
|
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
    """Recursively locate street-level crime CSVs below ``crime_dir``.

    Returns:
        A pair ``(street_csvs, ignored_count)``: the paths (in sorted order)
        whose file name fully matches ``STREET_CRIME_CSV_RE``, and the number
        of other ``*.csv`` paths that were found but did not match.
    """
    street_csvs: list[Path] = []
    ignored_count = 0
    for candidate in sorted(crime_dir.rglob("*.csv")):
        if STREET_CRIME_CSV_RE.fullmatch(candidate.name) is None:
            ignored_count += 1
        else:
            street_csvs.append(candidate)
    return street_csvs, ignored_count
|
|
|
|
|
|
def transform_crime(
    crime_dir: Path,
    output_path: Path,
    by_year_output_path: Path | None = None,
    lsoa_lookup_path: Path | None = None,
) -> None:
    """Build the per-LSOA annualised crime table from street-level crime CSVs.

    Every street-level CSV under ``crime_dir`` is scanned lazily, LSOA codes
    are passed through ``_apply_lsoa_2011_to_2021``, and the weighted incident
    counts are converted to a per-year average for each (LSOA, crime type).
    The result is pivoted to one row per LSOA with one
    ``"<crime type> (avg/yr)"`` column per crime type and written to
    ``output_path`` as zstd-compressed parquet.

    Args:
        crime_dir: Directory searched recursively for street-level CSVs.
        output_path: Destination parquet for the wide yearly-average table.
        by_year_output_path: When given, the per-year series are additionally
            written there by ``_write_crime_by_year``.
        lsoa_lookup_path: Optional lookup parquet forwarded to
            ``_apply_lsoa_2011_to_2021``; when omitted, codes are left as-is
            and every row gets weight 1.

    Raises:
        FileNotFoundError: No street-level CSV exists under ``crime_dir``.
        ValueError: No row has a well-formed ``Month`` value, or no row
            survives the LSOA / crime-type validity filter.
    """
    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
    if not csvs:
        raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")

    # Matching file names always start with "YYYY-MM-", so the month is taken
    # from the name itself rather than from the parent directory, which is
    # only correct when files are laid out one directory per month.
    month_count = len({path.name[:7] for path in csvs})
    print(
        f"Found {len(csvs)} street crime CSV files across {month_count} months"
        + (
            f" (ignored {ignored_csv_count} non-street CSVs)"
            if ignored_csv_count
            else ""
        )
    )

    df = pl.scan_csv(
        csvs,
        schema_overrides={
            "LSOA code": pl.Utf8,
            "Crime type": pl.Utf8,
            "Month": pl.Utf8,
        },
    ).select("LSOA code", "Crime type", "Month")

    # Adds the `_weight` column used below (and possibly rewrites LSOA codes).
    df = _apply_lsoa_2011_to_2021(df, lsoa_lookup_path)

    valid_month_expr = pl.col("Month").str.contains(MONTH_RE)
    valid_months = (
        df.filter(valid_month_expr)
        .select("Month")
        .unique()
        .collect(engine="streaming")["Month"]
        .sort()
        .to_list()
    )
    if not valid_months:
        raise ValueError(f"No valid crime months found in {crime_dir}")

    valid_month_count = len(valid_months)
    print(
        f"Using {valid_month_count} valid data months "
        f"({valid_months[0]} to {valid_months[-1]})"
    )

    # Annualise over every valid month in the dataset.
    # `_weight` (≤1) comes from the LSOA 2011→2021 lookup: 2011 LSOAs that split
    # into N 2021 LSOAs contribute 1/N of their count to each child, since we
    # don't know which child a given incident actually belonged to.
    # The weights are summed row by row (rather than `first() * len()`), so a
    # 2021 LSOA fed by several 2011 parents with different split factors still
    # receives the correct total.
    yearly_counts = (
        df.filter(
            valid_month_expr
            & pl.col("LSOA code").is_not_null()
            & (pl.col("LSOA code") != "")
            & pl.col("Crime type").is_not_null()
            & (pl.col("Crime type") != "")
        )
        .group_by("LSOA code", "Crime type")
        .agg(
            (pl.col("_weight").sum() / pl.lit(valid_month_count) * 12)
            .round(1)
            .alias("yearly_avg")
        )
        .collect(engine="streaming")
    )
    if yearly_counts.is_empty():
        raise ValueError(f"No valid crime rows found in {crime_dir}")

    print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")

    # Pivot crime types into columns. group_by output order is arbitrary, so
    # sort first (crime-type columns come out alphabetically) and sort the
    # rows afterwards to make the written file reproducible between runs.
    wide = (
        yearly_counts.sort("Crime type", "LSOA code")
        .pivot(
            on="Crime type",
            index="LSOA code",
            values="yearly_avg",
        )
        .sort("LSOA code")
    )

    # Fill nulls with 0 and rename columns to be descriptive
    value_cols = [col for col in wide.columns if col != "LSOA code"]
    wide = wide.with_columns(pl.col(col).fill_null(0) for col in value_cols)
    wide = wide.rename({col: f"{col} (avg/yr)" for col in value_cols})

    print(f"Output shape: {wide.shape}")
    print(f"Columns: {wide.columns}")

    wide.write_parquet(output_path, compression="zstd")
    print(f"Saved to {output_path}")

    if by_year_output_path is not None:
        _write_crime_by_year(df, valid_month_expr, by_year_output_path)
|
|
|
|
|
|
def _write_crime_by_year(
    df: pl.LazyFrame, valid_month_expr: pl.Expr, by_year_output_path: Path
) -> None:
    """Emit per-LSOA per-type per-year crime counts as nested list[struct] columns.

    Partial years are scaled to a 12-month-equivalent count so cross-year trends
    aren't distorted by months missing from the source data.

    Args:
        df: Lazy frame with "LSOA code", "Crime type", "Month" and "_weight"
            columns.
        valid_month_expr: Boolean expression selecting rows whose "Month" is
            well-formed.
        by_year_output_path: Destination parquet (zstd-compressed). One row
            per LSOA; one ``"<crime type> (by year)"`` column per crime type
            and per rollup, each holding a year-ordered list of
            ``{year, count}`` structs.

    Raises:
        ValueError: No row survives the validity filter.
    """
    filtered = df.filter(
        valid_month_expr
        & pl.col("LSOA code").is_not_null()
        & (pl.col("LSOA code") != "")
        & pl.col("Crime type").is_not_null()
        & (pl.col("Crime type") != "")
    ).with_columns(pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"))

    # Months observed *anywhere* in the dataset for each year (annualisation denominator).
    # Using crime-type-specific months would over-scale years where a rare type appears
    # in only some months.
    months_per_year = filtered.group_by("year").agg(
        pl.col("Month").n_unique().alias("months_in_year")
    )

    # Each row counts as its own `_weight`. Summing the weights (instead of
    # `first() * len()`) stays correct when rows that land in the same 2021
    # LSOA carry different weights because they came from different 2011
    # parents.
    yearly_per_type = (
        filtered.group_by("LSOA code", "Crime type", "year")
        .agg(pl.col("_weight").sum().alias("count"))
        .join(months_per_year, on="year")
        .with_columns(
            (pl.col("count").cast(pl.Float32) * 12.0 / pl.col("months_in_year"))
            .round(1)
            .alias("count")
        )
        .select("LSOA code", "Crime type", "year", "count")
        .collect(engine="streaming")
    )

    if yearly_per_type.is_empty():
        raise ValueError("No valid crime rows for by-year output")

    serious_rollup = _rollup_long(yearly_per_type, SERIOUS_CRIME_TYPES, "Serious crime")
    minor_rollup = _rollup_long(yearly_per_type, MINOR_CRIME_TYPES, "Minor crime")
    combined = pl.concat([yearly_per_type, serious_rollup, minor_rollup])

    # Sorting by year keeps every series chronological (row order inside a
    # group is preserved by group_by). Sorting by type/LSOA as well, together
    # with maintain_order, makes the pivoted column order reproducible.
    by_lsoa_type = (
        combined.sort("Crime type", "LSOA code", "year")
        .group_by("LSOA code", "Crime type", maintain_order=True)
        .agg(pl.struct("year", "count").alias("series"))
    )

    wide_by_year = by_lsoa_type.pivot(
        on="Crime type", index="LSOA code", values="series"
    ).sort("LSOA code")

    type_cols = [c for c in wide_by_year.columns if c != "LSOA code"]
    wide_by_year = wide_by_year.rename({col: f"{col} (by year)" for col in type_cols})

    print(f"By-year output shape: {wide_by_year.shape}")
    print(f"By-year columns: {wide_by_year.columns}")

    wide_by_year.write_parquet(by_year_output_path, compression="zstd")
    print(f"Saved by-year output to {by_year_output_path}")
|
|
|
|
|
|
def _rollup_long(
    yearly_per_type: pl.DataFrame, types: tuple[str, ...], rollup_name: str
) -> pl.DataFrame:
    """Collapse several crime types into one synthetic type, per LSOA and year.

    Rows of ``yearly_per_type`` whose "Crime type" is listed in ``types`` are
    summed per ("LSOA code", "year"); the total is rounded to one decimal and
    labelled with ``rollup_name``. The returned frame has the columns
    "LSOA code", "Crime type", "year", "count", in that order.
    """
    belongs_to_rollup = pl.col("Crime type").is_in(list(types))
    rounded_total = pl.col("count").sum().round(1).alias("count")

    totals = (
        yearly_per_type.filter(belongs_to_rollup)
        .group_by("LSOA code", "year")
        .agg(rounded_total)
    )
    labelled = totals.with_columns(pl.lit(rollup_name).alias("Crime type"))
    return labelled.select("LSOA code", "Crime type", "year", "count")
|
|
|
|
|
|
def _apply_lsoa_2011_to_2021(
    df: pl.LazyFrame, lsoa_lookup_path: Path | None
) -> pl.LazyFrame:
    """Remap pre-2022 LSOA 2011 codes to LSOA 2021 codes.

    Police.uk reports older years using LSOA 2011 codes; the rest of the pipeline
    keys on LSOA 2021. Without remapping, those years silently fail to join and
    the crime-over-time chart only shows post-2022 data.

    For 1:1 mappings the LSOA code is rewritten in place. For 1→N splits (one
    2011 LSOA becoming several 2021 ones), each child gets an even share via
    `_weight = 1/N` since the source CSVs don't tell us which child a given
    incident actually fell into.

    Args:
        df: Lazy frame with an "LSOA code" column.
        lsoa_lookup_path: Parquet with "lsoa11" and "lsoa21" columns, or
            ``None`` to skip remapping.

    Returns:
        ``df`` with a float "_weight" column added. Codes found in the lookup
        are replaced by their 2021 code (one output row per child); codes not
        found are kept unchanged with weight 1.
    """
    if lsoa_lookup_path is None:
        return df.with_columns(pl.lit(1.0).alias("_weight"))

    # Keep only distinct, fully populated (lsoa11, lsoa21) pairs. A repeated
    # pair would inflate N and duplicate crime rows in the join, and a null
    # code would create a bogus child, both of which skew the 1/N shares.
    lookup = (
        pl.scan_parquet(lsoa_lookup_path)
        .select("lsoa11", "lsoa21")
        .drop_nulls()
        .unique()
    )
    weighted = lookup.with_columns(
        (1.0 / pl.col("lsoa21").count().over("lsoa11")).alias("_weight")
    )
    return (
        # Left join: rows whose code is absent from the lookup survive with
        # null lsoa21 / _weight, which the next step turns into "keep as-is".
        df.join(weighted, left_on="LSOA code", right_on="lsoa11", how="left")
        .with_columns(
            pl.coalesce("lsoa21", "LSOA code").alias("LSOA code"),
            pl.col("_weight").fill_null(1.0),
        )
        .drop("lsoa21")
    )
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: parse the options and run ``transform_crime``."""
    cli = argparse.ArgumentParser(
        description="Transform crime CSVs into yearly average by LSOA and crime type"
    )

    # (flag, required?, help text) — registered in this order.
    path_options = (
        ("--input", True, "Directory containing crime data"),
        ("--output", True, "Output parquet file path"),
        (
            "--output-by-year",
            False,
            "Optional output parquet for per-LSOA per-year per-type counts (nested list[struct])",
        ),
        (
            "--lsoa-lookup",
            False,
            "Optional parquet with columns (lsoa11, lsoa21) for remapping pre-2022 codes",
        ),
    )
    for flag, is_required, help_text in path_options:
        cli.add_argument(flag, type=Path, required=is_required, help=help_text)

    options = cli.parse_args()
    transform_crime(
        crime_dir=options.input,
        output_path=options.output,
        by_year_output_path=options.output_by_year,
        lsoa_lookup_path=options.lsoa_lookup,
    )
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|