//! Per-postcode per-crime-type per-year crime counts, loaded from a side //! parquet and used by the right pane to plot crime-over-time. Filtering is not //! supported — this data is display-only. use std::path::Path; use anyhow::{bail, Context}; use polars::lazy::frame::LazyFrame; use polars::prelude::PlRefPath; use polars::prelude::*; use rustc_hash::FxHashMap; use tracing::info; use super::run_polars_io; /// Suffix appended to the underlying crime-type column name in the parquet /// (e.g. `"Burglary (by year)"`). Stripped to derive the display name. pub const BY_YEAR_SUFFIX: &str = " (by year)"; /// Per-postcode police-force coverage calendar column: `list[struct{year, /// months}]` of the years the postcode's home force published enough months. /// police.uk has multi-year publication gaps for whole forces (e.g. Greater /// Manchester 2019-07 onwards), and a missing year is *no data*, not zero /// crime — consumers must exclude uncovered (postcode, year)s instead of /// charting them as zeros. pub const COVERAGE_COLUMN: &str = "covered_years"; #[derive(Clone, Copy)] pub struct YearPoint { pub year: i32, pub count: f32, } /// One per crime type: ordered list of (year, count) for a single postcode. pub struct PostcodeCrimeSeries { /// Index into `crime_types`. pub type_idx: u16, pub points: Vec, } pub struct CrimeByYearData { /// All crime type names in stable insertion order. pub crime_types: Vec, /// All years available for each crime type, same order as `crime_types`. pub years_by_type: Vec>, /// Postcode → all available per-type series for that postcode. pub series_by_postcode: FxHashMap>, /// Postcode → years its police force actually published data for (from /// the `covered_years` column). An EMPTY vec means the postcode's crime /// picture is unknown (force gap / unusable geometry) — it must not count /// toward any year. A postcode ABSENT from this map (legacy parquet /// without the column) is treated as covered for every year. pub covered_years_by_postcode: FxHashMap>, } impl CrimeByYearData { pub fn load(path: &Path) -> anyhow::Result { run_polars_io(|| Self::load_inner(path)) } fn load_inner(path: &Path) -> anyhow::Result { info!("Loading crime-by-year from {}", path.display()); let pl_path = PlRefPath::try_from_path(path).with_context(|| { format!( "Failed to normalize crime-by-year parquet path {}", path.display() ) })?; let df = LazyFrame::scan_parquet(pl_path, Default::default()) .with_context(|| format!("Failed to scan crime-by-year parquet at {}", path.display()))? .collect() .with_context(|| { format!("Failed to read crime-by-year parquet at {}", path.display()) })?; let postcode_col = df .column("postcode") .context("crime-by-year parquet missing 'postcode' column")? .str() .context("'postcode' column is not a string")?; let postcode_values: Vec = postcode_col .into_iter() .enumerate() .map(|(row, value)| { let value = value.with_context(|| format!("crime-by-year row {row} has null postcode"))?; let trimmed = value.trim(); if trimmed.is_empty() { bail!("crime-by-year row {row} has blank postcode"); } Ok(trimmed.to_string()) }) .collect::>>()?; // Discover crime-type columns (anything with the by-year suffix). let crime_type_cols: Vec<(String, String)> = df .get_column_names() .iter() .filter_map(|name| { let name = name.as_str(); name.strip_suffix(BY_YEAR_SUFFIX) .map(|stripped| (stripped.to_string(), name.to_string())) }) .collect(); if crime_type_cols.is_empty() { bail!( "crime-by-year parquet at {} has no '* (by year)' columns", path.display() ); } let crime_types: Vec = crime_type_cols.iter().map(|(t, _)| t.clone()).collect(); let mut series_by_postcode: FxHashMap> = FxHashMap::default(); let mut years_by_type: Vec> = Vec::with_capacity(crime_type_cols.len()); let row_count = df.height(); for (type_idx, (_, col_name)) in crime_type_cols.iter().enumerate() { let mut years_for_type = std::collections::BTreeSet::new(); let col = df .column(col_name) .with_context(|| format!("Missing crime-by-year column '{col_name}'"))?; let list_ca = col .list() .with_context(|| format!("Column '{col_name}' is not a list"))?; for (row, postcode) in postcode_values.iter().enumerate().take(row_count) { let Some(inner) = list_ca.get_as_series(row) else { continue; }; if inner.is_empty() { continue; } let structs = inner .struct_() .with_context(|| format!("Inner of '{col_name}' is not a struct"))?; let years = structs .field_by_name("year") .with_context(|| format!("Missing 'year' field in '{col_name}'"))?; let counts = structs .field_by_name("count") .with_context(|| format!("Missing 'count' field in '{col_name}'"))?; let mut points: Vec = Vec::with_capacity(inner.len()); for idx in 0..inner.len() { let yr = match years.get(idx).ok() { Some(AnyValue::Int32(y)) => y, Some(AnyValue::Int64(y)) => y as i32, _ => continue, }; let cnt = match counts.get(idx).ok() { Some(AnyValue::Float32(c)) => c, Some(AnyValue::Float64(c)) => c as f32, Some(AnyValue::Int32(c)) => c as f32, Some(AnyValue::Int64(c)) => c as f32, _ => continue, }; points.push(YearPoint { year: yr, count: cnt, }); years_for_type.insert(yr); } if points.is_empty() { continue; } points.sort_by_key(|p| p.year); series_by_postcode .entry(postcode.clone()) .or_default() .push(PostcodeCrimeSeries { type_idx: type_idx as u16, points, }); } years_by_type.push(years_for_type.into_iter().collect()); } // Force-coverage calendar (optional column: legacy parquets predate it; // their postcodes are treated as fully covered). A row with an empty // list is meaningful — zero covered years — so it IS inserted. let mut covered_years_by_postcode: FxHashMap> = FxHashMap::default(); if let Ok(col) = df.column(COVERAGE_COLUMN) { let list_ca = col .list() .with_context(|| format!("Column '{COVERAGE_COLUMN}' is not a list"))?; for (row, postcode) in postcode_values.iter().enumerate().take(row_count) { let Some(inner) = list_ca.get_as_series(row) else { // Null coverage: treat as legacy/fully covered (skip). continue; }; let mut years: Vec = Vec::with_capacity(inner.len()); if !inner.is_empty() { let structs = inner.struct_().with_context(|| { format!("Inner of '{COVERAGE_COLUMN}' is not a struct") })?; let year_field = structs.field_by_name("year").with_context(|| { format!("Missing 'year' field in '{COVERAGE_COLUMN}'") })?; for idx in 0..inner.len() { match year_field.get(idx).ok() { Some(AnyValue::Int32(y)) => years.push(y), Some(AnyValue::Int64(y)) => years.push(y as i32), _ => continue, } } } covered_years_by_postcode.insert(postcode.clone(), years); } } info!( postcodes = series_by_postcode.len(), crime_types = crime_types.len(), with_coverage = covered_years_by_postcode.len(), "Crime-by-year data loaded" ); Ok(Self { crime_types, years_by_type, series_by_postcode, covered_years_by_postcode, }) } }