Fable findings in data

This commit is contained in:
Andras Schmelczer 2026-06-11 07:49:23 +01:00
parent b98bc6d611
commit 6a33b03fdf
20 changed files with 1502 additions and 274 deletions

View file

@ -17,6 +17,14 @@ use super::run_polars_io;
/// (e.g. `"Burglary (by year)"`). Stripped to derive the display name.
pub const BY_YEAR_SUFFIX: &str = " (by year)";
/// Per-postcode police-force coverage calendar column: `list[struct{year,
/// months}]` of the years the postcode's home force published enough months.
/// police.uk has multi-year publication gaps for whole forces (e.g. Greater
/// Manchester 2019-07 onwards), and a missing year is *no data*, not zero
/// crime — consumers must exclude uncovered (postcode, year)s instead of
/// charting them as zeros.
///
/// The column is optional. The loader in this file treats a parquet without
/// it, and any row whose cell is null, as covered for every year; an empty
/// list is kept and means zero covered years. Only the `year` field of each
/// struct is read when loading; `months` is not consulted there.
pub const COVERAGE_COLUMN: &str = "covered_years";
#[derive(Clone, Copy)]
pub struct YearPoint {
pub year: i32,
@ -37,6 +45,12 @@ pub struct CrimeByYearData {
pub years_by_type: Vec<Vec<i32>>,
/// Postcode → all available per-type series for that postcode.
pub series_by_postcode: FxHashMap<String, Vec<PostcodeCrimeSeries>>,
/// Postcode → years its police force actually published data for (from
/// the `covered_years` column). An EMPTY vec means the postcode's crime
/// picture is unknown (force gap / unusable geometry) — it must not count
/// toward any year. A postcode ABSENT from this map (legacy parquet
/// without the column, or a row whose coverage cell is null) is treated
/// as covered for every year.
pub covered_years_by_postcode: FxHashMap<String, Vec<i32>>,
}
impl CrimeByYearData {
@ -165,9 +179,44 @@ impl CrimeByYearData {
years_by_type.push(years_for_type.into_iter().collect());
}
// Force-coverage calendar. The column is optional: legacy parquets predate
// it, and when it cannot be fetched the map stays empty so every postcode
// is treated as fully covered. Within the column:
//   * a null cell    -> postcode is NOT inserted (treated as fully covered);
//   * an empty list  -> postcode IS inserted with no years (zero coverage);
//   * otherwise      -> the `year` field of each struct entry is collected.
let mut covered_years_by_postcode: FxHashMap<String, Vec<i32>> = FxHashMap::default();
if let Ok(col) = df.column(COVERAGE_COLUMN) {
    let list_ca = col
        .list()
        .with_context(|| format!("Column '{COVERAGE_COLUMN}' is not a list"))?;
    for (row, postcode) in postcode_values.iter().enumerate().take(row_count) {
        let Some(inner) = list_ca.get_as_series(row) else {
            // Null coverage: treat as legacy/fully covered (skip).
            continue;
        };
        let mut years: Vec<i32> = Vec::with_capacity(inner.len());
        // An empty inner series is not inspected as a struct at all, so a
        // zero-length list never trips the struct/field checks below.
        if !inner.is_empty() {
            let structs = inner
                .struct_()
                .with_context(|| format!("Inner of '{COVERAGE_COLUMN}' is not a struct"))?;
            let year_field = structs
                .field_by_name("year")
                .with_context(|| format!("Missing 'year' field in '{COVERAGE_COLUMN}'"))?;
            for idx in 0..inner.len() {
                match year_field.get(idx).ok() {
                    Some(AnyValue::Int32(y)) => years.push(y),
                    Some(AnyValue::Int64(y)) => {
                        // Checked narrowing: a plain `as i32` would wrap an
                        // out-of-range value into a bogus "covered" year.
                        // Such an entry is dropped instead.
                        if let Ok(y) = i32::try_from(y) {
                            years.push(y);
                        }
                    }
                    // Null, non-integer, or unreadable entries carry no year.
                    _ => {}
                }
            }
        }
        covered_years_by_postcode.insert(postcode.clone(), years);
    }
}
info!(
postcodes = series_by_postcode.len(),
crime_types = crime_types.len(),
with_coverage = covered_years_by_postcode.len(),
"Crime-by-year data loaded"
);
@ -175,6 +224,7 @@ impl CrimeByYearData {
crime_types,
years_by_type,
series_by_postcode,
covered_years_by_postcode,
})
}
}