This commit is contained in:
Andras Schmelczer 2026-05-14 08:09:19 +01:00
parent a8165249a4
commit a4103b0896
64 changed files with 5376 additions and 3832 deletions

View file

@ -5,6 +5,7 @@ from pathlib import Path
import polars as pl
# File-name pattern for street-level crime CSVs: a "YYYY-MM-" prefix, a
# non-empty middle part, and a literal "-street.csv" suffix.
STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
# Pattern for a well-formed "YYYY-MM" value in the "Month" column. Kept as a
# plain string (not a compiled pattern) because it is passed to polars'
# `str.contains`. The month part is limited to 01-12 so that impossible values
# such as "2024-00" or "2024-13" are not counted as valid data months, which
# would otherwise inflate the month count used to annualise crime totals.
MONTH_RE = r"^\d{4}-(?:0[1-9]|1[0-2])$"
def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
@ -37,16 +38,45 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
},
).select("LSOA code", "Crime type", "Month")
# Extract year, count crimes per LSOA / year / crime type
valid_month_expr = pl.col("Month").str.contains(MONTH_RE)
valid_months = (
df.filter(valid_month_expr)
.select("Month")
.unique()
.collect(engine="streaming")["Month"]
.sort()
.to_list()
)
if not valid_months:
raise ValueError(f"No valid crime months found in {crime_dir}")
valid_month_count = len(valid_months)
print(
f"Using {valid_month_count} valid data months "
f"({valid_months[0]} to {valid_months[-1]})"
)
# Count monthly incidents, then annualise over every valid month in the dataset.
yearly_counts = (
df.filter(pl.col("LSOA code").is_not_null() & (pl.col("LSOA code") != ""))
.with_columns(pl.col("Month").str.slice(0, 4).alias("year"))
.group_by("LSOA code", "year", "Crime type")
df.filter(
valid_month_expr
& pl.col("LSOA code").is_not_null()
& (pl.col("LSOA code") != "")
& pl.col("Crime type").is_not_null()
& (pl.col("Crime type") != "")
)
.group_by("LSOA code", "Month", "Crime type")
.agg(pl.len().alias("count"))
.group_by("LSOA code", "Crime type")
.agg(pl.col("count").mean().round(1).alias("yearly_avg"))
.agg(
(pl.col("count").sum() / pl.lit(valid_month_count) * 12)
.round(1)
.alias("yearly_avg")
)
.collect(engine="streaming")
)
if yearly_counts.is_empty():
raise ValueError(f"No valid crime rows found in {crime_dir}")
print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")