This commit is contained in:
Andras Schmelczer 2026-06-02 13:46:18 +01:00
parent a04ac2d857
commit d43da9708c
47 changed files with 4120 additions and 573 deletions

View file

@@ -95,11 +95,14 @@ def transform_crime(
f"({valid_months[0]} to {valid_months[-1]})"
)
# Count monthly incidents, then annualise over every valid month in the dataset.
# `_weight` (≤1) comes from the LSOA 2011→2021 lookup: 2011 LSOAs that split
# into N 2021 LSOAs contribute 1/N of their count to each child, since we
# don't know which child a given incident actually belonged to.
yearly_counts = (
# Annualise each year separately (count_in_year * 12 / months_in_year), then
# take the simple mean of those per-year rates over the years each type is
# present. This makes the headline equal the average of the by-year chart bars
# (_write_crime_by_year) instead of a month-weighted pooled rate, mirroring
# crime_spatial._write_avg_yr. `_weight` (≤1) comes from the LSOA 2011→2021
# lookup: 2011 LSOAs that split into N 2021 LSOAs contribute 1/N of their count
# to each child, since we don't know which child an incident actually belonged to.
filtered = (
df.filter(
valid_month_expr
& pl.col("LSOA code").is_not_null()
@@ -107,15 +110,31 @@ def transform_crime(
& pl.col("Crime type").is_not_null()
& (pl.col("Crime type") != "")
)
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
.group_by("LSOA code", "Month", "Crime type")
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "Crime type")
.agg(
(pl.col("count").sum() / pl.lit(valid_month_count) * 12)
.round(1)
.alias("yearly_avg")
.with_columns(
pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year"),
pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES),
)
)
# Months observed *anywhere* in the dataset for each year (annualisation
# denominator), matching the by-year output's per-year scaling.
months_per_year = filtered.group_by("year").agg(
pl.col("Month").n_unique().alias("months_in_year")
)
yearly_counts = (
filtered.group_by("LSOA code", "year", "Crime type", "Month")
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "year", "Crime type")
.agg(pl.col("count").sum().alias("count"))
.join(months_per_year, on="year")
.with_columns(
(pl.col("count") * 12.0 / pl.col("months_in_year")).alias("per_year")
)
# Mean of the per-year annualised rates over the years the type is present.
# Only (LSOA, year, type) combinations with at least one row reach this
# group_by, so years with no rows are left out of the mean rather than
# being counted as zero.
.group_by("LSOA code", "Crime type")
.agg(pl.col("per_year").mean().round(1).alias("yearly_avg"))
.collect(engine="streaming")
)
if yearly_counts.is_empty():