perfect-postcode/server-rs/src/data/crime_by_year.rs

230 lines
9.3 KiB
Rust

//! Per-postcode per-crime-type per-year crime counts, loaded from a side
//! parquet and used by the right pane to plot crime-over-time. Filtering is not
//! supported — this data is display-only.
use std::path::Path;
use anyhow::{bail, Context};
use polars::lazy::frame::LazyFrame;
use polars::prelude::PlRefPath;
use polars::prelude::*;
use rustc_hash::FxHashMap;
use tracing::info;
use super::run_polars_io;
/// Suffix appended to the underlying crime-type column name in the parquet
/// (e.g. `"Burglary (by year)"`). Stripped to derive the display name.
///
/// The loader in this file treats every column whose name ends with this
/// exact suffix (including the leading space) as a crime-type column.
pub const BY_YEAR_SUFFIX: &str = " (by year)";
/// Per-postcode police-force coverage calendar column: `list[struct{year,
/// months}]` of the years the postcode's home force published enough months.
/// police.uk has multi-year publication gaps for whole forces (e.g. Greater
/// Manchester 2019-07 onwards), and a missing year is *no data*, not zero
/// crime — consumers must exclude uncovered (postcode, year)s instead of
/// charting them as zeros.
///
/// The column is optional at load time: when the parquet does not contain it,
/// no coverage entries are recorded. The loader in this file reads only the
/// `year` field of each struct.
pub const COVERAGE_COLUMN: &str = "covered_years";
/// A single observation in a crime time series: the count recorded for one
/// calendar year.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct YearPoint {
    /// Calendar year of the observation.
    pub year: i32,
    /// Crime count for that year, stored as `f32`.
    pub count: f32,
}

/// One per crime type: ordered list of (year, count) for a single postcode.
#[derive(Debug, Clone, PartialEq)]
pub struct PostcodeCrimeSeries {
    /// Index into `crime_types`.
    pub type_idx: u16,
    /// Observations for this crime type; the loader stores them sorted by
    /// ascending year.
    pub points: Vec<YearPoint>,
}
/// In-memory view of the crime-by-year parquet, built by `CrimeByYearData::load`.
///
/// `crime_types` and `years_by_type` are parallel vectors (same length, same
/// order); `PostcodeCrimeSeries::type_idx` indexes into both.
pub struct CrimeByYearData {
/// All crime type names in stable insertion order: the parquet's
/// `* (by year)` column names with the suffix stripped, in column order.
pub crime_types: Vec<String>,
/// All years available for each crime type, same order as `crime_types`.
/// Each inner list is sorted ascending and contains no duplicates.
pub years_by_type: Vec<Vec<i32>>,
/// Postcode → all available per-type series for that postcode. Keys are the
/// whitespace-trimmed postcode strings from the parquet; a postcode is only
/// present if at least one of its series has a usable point.
pub series_by_postcode: FxHashMap<String, Vec<PostcodeCrimeSeries>>,
/// Postcode → years its police force actually published data for (from
/// the `covered_years` column). An EMPTY vec means the postcode's crime
/// picture is unknown (force gap / unusable geometry) — it must not count
/// toward any year. A postcode ABSENT from this map (legacy parquet
/// without the column) is treated as covered for every year.
/// Years are kept in the order they appear in the parquet; the loader does
/// not sort or de-duplicate them.
pub covered_years_by_postcode: FxHashMap<String, Vec<i32>>,
}
impl CrimeByYearData {
pub fn load(path: &Path) -> anyhow::Result<Self> {
run_polars_io(|| Self::load_inner(path))
}
fn load_inner(path: &Path) -> anyhow::Result<Self> {
info!("Loading crime-by-year from {}", path.display());
let pl_path = PlRefPath::try_from_path(path).with_context(|| {
format!(
"Failed to normalize crime-by-year parquet path {}",
path.display()
)
})?;
let df = LazyFrame::scan_parquet(pl_path, Default::default())
.with_context(|| format!("Failed to scan crime-by-year parquet at {}", path.display()))?
.collect()
.with_context(|| {
format!("Failed to read crime-by-year parquet at {}", path.display())
})?;
let postcode_col = df
.column("postcode")
.context("crime-by-year parquet missing 'postcode' column")?
.str()
.context("'postcode' column is not a string")?;
let postcode_values: Vec<String> = postcode_col
.into_iter()
.enumerate()
.map(|(row, value)| {
let value =
value.with_context(|| format!("crime-by-year row {row} has null postcode"))?;
let trimmed = value.trim();
if trimmed.is_empty() {
bail!("crime-by-year row {row} has blank postcode");
}
Ok(trimmed.to_string())
})
.collect::<anyhow::Result<Vec<_>>>()?;
// Discover crime-type columns (anything with the by-year suffix).
let crime_type_cols: Vec<(String, String)> = df
.get_column_names()
.iter()
.filter_map(|name| {
let name = name.as_str();
name.strip_suffix(BY_YEAR_SUFFIX)
.map(|stripped| (stripped.to_string(), name.to_string()))
})
.collect();
if crime_type_cols.is_empty() {
bail!(
"crime-by-year parquet at {} has no '* (by year)' columns",
path.display()
);
}
let crime_types: Vec<String> = crime_type_cols.iter().map(|(t, _)| t.clone()).collect();
let mut series_by_postcode: FxHashMap<String, Vec<PostcodeCrimeSeries>> =
FxHashMap::default();
let mut years_by_type: Vec<Vec<i32>> = Vec::with_capacity(crime_type_cols.len());
let row_count = df.height();
for (type_idx, (_, col_name)) in crime_type_cols.iter().enumerate() {
let mut years_for_type = std::collections::BTreeSet::new();
let col = df
.column(col_name)
.with_context(|| format!("Missing crime-by-year column '{col_name}'"))?;
let list_ca = col
.list()
.with_context(|| format!("Column '{col_name}' is not a list"))?;
for (row, postcode) in postcode_values.iter().enumerate().take(row_count) {
let Some(inner) = list_ca.get_as_series(row) else {
continue;
};
if inner.is_empty() {
continue;
}
let structs = inner
.struct_()
.with_context(|| format!("Inner of '{col_name}' is not a struct"))?;
let years = structs
.field_by_name("year")
.with_context(|| format!("Missing 'year' field in '{col_name}'"))?;
let counts = structs
.field_by_name("count")
.with_context(|| format!("Missing 'count' field in '{col_name}'"))?;
let mut points: Vec<YearPoint> = Vec::with_capacity(inner.len());
for idx in 0..inner.len() {
let yr = match years.get(idx).ok() {
Some(AnyValue::Int32(y)) => y,
Some(AnyValue::Int64(y)) => y as i32,
_ => continue,
};
let cnt = match counts.get(idx).ok() {
Some(AnyValue::Float32(c)) => c,
Some(AnyValue::Float64(c)) => c as f32,
Some(AnyValue::Int32(c)) => c as f32,
Some(AnyValue::Int64(c)) => c as f32,
_ => continue,
};
points.push(YearPoint {
year: yr,
count: cnt,
});
years_for_type.insert(yr);
}
if points.is_empty() {
continue;
}
points.sort_by_key(|p| p.year);
series_by_postcode
.entry(postcode.clone())
.or_default()
.push(PostcodeCrimeSeries {
type_idx: type_idx as u16,
points,
});
}
years_by_type.push(years_for_type.into_iter().collect());
}
// Force-coverage calendar (optional column: legacy parquets predate it;
// their postcodes are treated as fully covered). A row with an empty
// list is meaningful — zero covered years — so it IS inserted.
let mut covered_years_by_postcode: FxHashMap<String, Vec<i32>> =
FxHashMap::default();
if let Ok(col) = df.column(COVERAGE_COLUMN) {
let list_ca = col
.list()
.with_context(|| format!("Column '{COVERAGE_COLUMN}' is not a list"))?;
for (row, postcode) in postcode_values.iter().enumerate().take(row_count) {
let Some(inner) = list_ca.get_as_series(row) else {
// Null coverage: treat as legacy/fully covered (skip).
continue;
};
let mut years: Vec<i32> = Vec::with_capacity(inner.len());
if !inner.is_empty() {
let structs = inner.struct_().with_context(|| {
format!("Inner of '{COVERAGE_COLUMN}' is not a struct")
})?;
let year_field = structs.field_by_name("year").with_context(|| {
format!("Missing 'year' field in '{COVERAGE_COLUMN}'")
})?;
for idx in 0..inner.len() {
match year_field.get(idx).ok() {
Some(AnyValue::Int32(y)) => years.push(y),
Some(AnyValue::Int64(y)) => years.push(y as i32),
_ => continue,
}
}
}
covered_years_by_postcode.insert(postcode.clone(), years);
}
}
info!(
postcodes = series_by_postcode.len(),
crime_types = crime_types.len(),
with_coverage = covered_years_by_postcode.len(),
"Crime-by-year data loaded"
);
Ok(Self {
crime_types,
years_by_type,
series_by_postcode,
covered_years_by_postcode,
})
}
}