Improve data
This commit is contained in:
parent
b4d66a28c1
commit
85da1941aa
31 changed files with 901 additions and 319 deletions
|
|
@@ -273,27 +273,24 @@ def _write_avg_yr(
|
|||
for type_idx, name in enumerate(ALL_CRIME_TYPES):
|
||||
data[f"{name} (avg/yr)"] = avg[:, type_idx]
|
||||
|
||||
# Serious/Minor rollup headlines, computed the SAME way as the by-year rollup
|
||||
# bars (_write_by_year/_rollup_long): sum the rollup's types per year, then
|
||||
# average over the years in which ANY of those types occurred. This keeps the
|
||||
# headline equal to the mean of the "Serious/Minor crime (by year)" bars.
|
||||
# Summing the per-type avg/yr values instead (as the merge previously did)
|
||||
# divides each type by its OWN years-present and overstates the rollup when a
|
||||
# postcode's serious/minor types occur in disjoint years.
|
||||
# Serious/Minor rollup headlines = the exact SUM of their component (avg/yr)
|
||||
# columns, so each rollup always equals the sum of the parts shown beside it
|
||||
# and can never fall below one of its own components. (Previously the rollup
|
||||
# re-derived a union-years-present mean: it divided the summed counts by the
|
||||
# number of years in which ANY component type occurred, whereas each
|
||||
# component divides by its OWN years-present. When a postcode's serious/minor
|
||||
# types occurred in disjoint years the union denominator was larger, so the
|
||||
# rollup came out smaller than the sum of its parts.) The by-year rollup
|
||||
# series in _write_by_year is likewise the per-year sum of the component
|
||||
# bars, so headline and chart both present the rollup as the sum of its parts.
|
||||
for rollup_name, rollup_types in (
|
||||
("Serious crime", SERIOUS_CRIME_TYPES),
|
||||
("Minor crime", MINOR_CRIME_TYPES),
|
||||
):
|
||||
rollup_idx = [ALL_CRIME_TYPES.index(name) for name in rollup_types]
|
||||
rollup_counts = counts[:, rollup_idx, :].sum(axis=1) # (n_postcodes, n_years)
|
||||
rollup_per_year = per_year[:, rollup_idx, :].sum(axis=1)
|
||||
rollup_years_present = np.clip(
|
||||
(rollup_counts > 0).sum(axis=1), 1, None
|
||||
).astype(np.float64)
|
||||
rollup_avg = rollup_per_year.sum(axis=1) / rollup_years_present
|
||||
data[f"{rollup_name} (avg/yr)"] = np.round(rollup_avg * norm, 1).astype(
|
||||
np.float32
|
||||
)
|
||||
data[f"{rollup_name} (avg/yr)"] = np.round(
|
||||
avg[:, rollup_idx].sum(axis=1), 1
|
||||
).astype(np.float32)
|
||||
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pl.DataFrame(data).write_parquet(output_path, compression="zstd")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue