Split up
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s

This commit is contained in:
Andras Schmelczer 2026-06-12 21:51:37 +01:00
parent cf39ad754e
commit f59d01227b
91 changed files with 10370 additions and 7562 deletions

View file

@@ -123,10 +123,13 @@ def transform_crime(
)
yearly_counts = (
filtered.group_by("LSOA code", "year", "Crime type", "Month")
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "year", "Crime type")
.agg(pl.col("count").sum().alias("count"))
# Sum per-incident weights directly: a 2021 LSOA can receive incidents
# carrying different `_weight`s in the same month (split 2011 parent at
# 1/N alongside an unsplit one at 1), so `_weight.first() * len` would
# apply one row's weight to all of them — and nondeterministically so,
# since `first` after a join has no ordering guarantee.
filtered.group_by("LSOA code", "year", "Crime type")
.agg(pl.col("_weight").sum().alias("count"))
.join(months_per_year, on="year")
.with_columns(
(pl.col("count") * 12.0 / pl.col("months_in_year")).alias("per_year")
@@ -191,10 +194,10 @@ def _write_crime_by_year(
)
yearly_per_type = (
filtered.group_by("LSOA code", "Crime type", "year", "Month")
.agg((pl.col("_weight").first() * pl.len()).alias("count"))
.group_by("LSOA code", "Crime type", "year")
.agg(pl.col("count").sum().alias("count"))
# Per-incident weight sum, not `_weight.first() * len` — see the
# matching comment in transform_crime.
filtered.group_by("LSOA code", "Crime type", "year")
.agg(pl.col("_weight").sum().alias("count"))
.join(months_per_year, on="year")
.with_columns(
(pl.col("count").cast(pl.Float32) * 12.0 / pl.col("months_in_year"))