180 lines
6.6 KiB
Rust
180 lines
6.6 KiB
Rust
//! Per-postcode, per-crime-type, per-year crime counts, loaded from a side
//! parquet and used by the right pane to plot crime over time. Filtering is not
//! supported — this data is display-only.
|
|
|
|
use std::path::Path;
|
|
|
|
use anyhow::{bail, Context};
|
|
use polars::lazy::frame::LazyFrame;
|
|
use polars::prelude::PlRefPath;
|
|
use polars::prelude::*;
|
|
use rustc_hash::FxHashMap;
|
|
use tracing::info;
|
|
|
|
use super::run_polars_io;
|
|
|
|
/// Column-name suffix that marks a per-year crime column in the parquet,
/// e.g. `"Burglary (by year)"`. Removing the suffix yields the crime type's
/// display name (`"Burglary"`).
pub const BY_YEAR_SUFFIX: &str = " (by year)";
|
|
|
|
/// A single observation in a crime time series: the count recorded for one
/// year.
///
/// Derives `Debug` and `PartialEq` in addition to `Clone`/`Copy` so points can
/// be logged and compared directly (e.g. in `assert_eq!`). `Eq` is deliberately
/// not derived because `count` is a float.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct YearPoint {
    /// Calendar year of the observation.
    pub year: i32,
    /// Crime count for that year, stored as a float.
    pub count: f32,
}
|
|
|
|
/// One per crime type: ordered list of (year, count) for a single postcode.
|
|
pub struct PostcodeCrimeSeries {
|
|
/// Index into `crime_types`.
|
|
pub type_idx: u16,
|
|
pub points: Vec<YearPoint>,
|
|
}
|
|
|
|
pub struct CrimeByYearData {
|
|
/// All crime type names in stable insertion order.
|
|
pub crime_types: Vec<String>,
|
|
/// All years available for each crime type, same order as `crime_types`.
|
|
pub years_by_type: Vec<Vec<i32>>,
|
|
/// Postcode → all available per-type series for that postcode.
|
|
pub series_by_postcode: FxHashMap<String, Vec<PostcodeCrimeSeries>>,
|
|
}
|
|
|
|
impl CrimeByYearData {
|
|
pub fn load(path: &Path) -> anyhow::Result<Self> {
|
|
run_polars_io(|| Self::load_inner(path))
|
|
}
|
|
|
|
fn load_inner(path: &Path) -> anyhow::Result<Self> {
|
|
info!("Loading crime-by-year from {}", path.display());
|
|
let pl_path = PlRefPath::try_from_path(path).with_context(|| {
|
|
format!(
|
|
"Failed to normalize crime-by-year parquet path {}",
|
|
path.display()
|
|
)
|
|
})?;
|
|
let df = LazyFrame::scan_parquet(pl_path, Default::default())
|
|
.with_context(|| format!("Failed to scan crime-by-year parquet at {}", path.display()))?
|
|
.collect()
|
|
.with_context(|| {
|
|
format!("Failed to read crime-by-year parquet at {}", path.display())
|
|
})?;
|
|
|
|
let postcode_col = df
|
|
.column("postcode")
|
|
.context("crime-by-year parquet missing 'postcode' column")?
|
|
.str()
|
|
.context("'postcode' column is not a string")?;
|
|
let postcode_values: Vec<String> = postcode_col
|
|
.into_iter()
|
|
.enumerate()
|
|
.map(|(row, value)| {
|
|
let value =
|
|
value.with_context(|| format!("crime-by-year row {row} has null postcode"))?;
|
|
let trimmed = value.trim();
|
|
if trimmed.is_empty() {
|
|
bail!("crime-by-year row {row} has blank postcode");
|
|
}
|
|
Ok(trimmed.to_string())
|
|
})
|
|
.collect::<anyhow::Result<Vec<_>>>()?;
|
|
|
|
// Discover crime-type columns (anything with the by-year suffix).
|
|
let crime_type_cols: Vec<(String, String)> = df
|
|
.get_column_names()
|
|
.iter()
|
|
.filter_map(|name| {
|
|
let name = name.as_str();
|
|
name.strip_suffix(BY_YEAR_SUFFIX)
|
|
.map(|stripped| (stripped.to_string(), name.to_string()))
|
|
})
|
|
.collect();
|
|
|
|
if crime_type_cols.is_empty() {
|
|
bail!(
|
|
"crime-by-year parquet at {} has no '* (by year)' columns",
|
|
path.display()
|
|
);
|
|
}
|
|
|
|
let crime_types: Vec<String> = crime_type_cols.iter().map(|(t, _)| t.clone()).collect();
|
|
|
|
let mut series_by_postcode: FxHashMap<String, Vec<PostcodeCrimeSeries>> =
|
|
FxHashMap::default();
|
|
let mut years_by_type: Vec<Vec<i32>> = Vec::with_capacity(crime_type_cols.len());
|
|
let row_count = df.height();
|
|
|
|
for (type_idx, (_, col_name)) in crime_type_cols.iter().enumerate() {
|
|
let mut years_for_type = std::collections::BTreeSet::new();
|
|
let col = df
|
|
.column(col_name)
|
|
.with_context(|| format!("Missing crime-by-year column '{col_name}'"))?;
|
|
let list_ca = col
|
|
.list()
|
|
.with_context(|| format!("Column '{col_name}' is not a list"))?;
|
|
|
|
for (row, postcode) in postcode_values.iter().enumerate().take(row_count) {
|
|
let Some(inner) = list_ca.get_as_series(row) else {
|
|
continue;
|
|
};
|
|
if inner.is_empty() {
|
|
continue;
|
|
}
|
|
let structs = inner
|
|
.struct_()
|
|
.with_context(|| format!("Inner of '{col_name}' is not a struct"))?;
|
|
let years = structs
|
|
.field_by_name("year")
|
|
.with_context(|| format!("Missing 'year' field in '{col_name}'"))?;
|
|
let counts = structs
|
|
.field_by_name("count")
|
|
.with_context(|| format!("Missing 'count' field in '{col_name}'"))?;
|
|
|
|
let mut points: Vec<YearPoint> = Vec::with_capacity(inner.len());
|
|
for idx in 0..inner.len() {
|
|
let yr = match years.get(idx).ok() {
|
|
Some(AnyValue::Int32(y)) => y,
|
|
Some(AnyValue::Int64(y)) => y as i32,
|
|
_ => continue,
|
|
};
|
|
let cnt = match counts.get(idx).ok() {
|
|
Some(AnyValue::Float32(c)) => c,
|
|
Some(AnyValue::Float64(c)) => c as f32,
|
|
Some(AnyValue::Int32(c)) => c as f32,
|
|
Some(AnyValue::Int64(c)) => c as f32,
|
|
_ => continue,
|
|
};
|
|
points.push(YearPoint {
|
|
year: yr,
|
|
count: cnt,
|
|
});
|
|
years_for_type.insert(yr);
|
|
}
|
|
if points.is_empty() {
|
|
continue;
|
|
}
|
|
points.sort_by_key(|p| p.year);
|
|
|
|
series_by_postcode
|
|
.entry(postcode.clone())
|
|
.or_default()
|
|
.push(PostcodeCrimeSeries {
|
|
type_idx: type_idx as u16,
|
|
points,
|
|
});
|
|
}
|
|
years_by_type.push(years_for_type.into_iter().collect());
|
|
}
|
|
|
|
info!(
|
|
postcodes = series_by_postcode.len(),
|
|
crime_types = crime_types.len(),
|
|
"Crime-by-year data loaded"
|
|
);
|
|
|
|
Ok(Self {
|
|
crime_types,
|
|
years_by_type,
|
|
series_by_postcode,
|
|
})
|
|
}
|
|
}
|