Fable findings in data

This commit is contained in:
Andras Schmelczer 2026-06-11 07:49:23 +01:00
parent b98bc6d611
commit 6a33b03fdf
20 changed files with 1502 additions and 274 deletions

View file

@ -17,6 +17,14 @@ use super::run_polars_io;
/// (e.g. `"Burglary (by year)"`). Stripped to derive the display name.
pub const BY_YEAR_SUFFIX: &str = " (by year)";
/// Name of the per-postcode police-force coverage calendar column.
///
/// The column holds a `list[struct{year, months}]` listing the years in which
/// the postcode's home force published enough months. police.uk has multi-year
/// publication gaps for whole forces (e.g. Greater Manchester 2019-07 onwards),
/// and a missing year is *no data*, not zero crime — consumers must exclude
/// uncovered (postcode, year)s instead of charting them as zeros.
pub const COVERAGE_COLUMN: &str = "covered_years";
#[derive(Clone, Copy)]
pub struct YearPoint {
pub year: i32,
@ -37,6 +45,12 @@ pub struct CrimeByYearData {
pub years_by_type: Vec<Vec<i32>>,
/// Postcode → all available per-type series for that postcode.
pub series_by_postcode: FxHashMap<String, Vec<PostcodeCrimeSeries>>,
/// Postcode → years its police force actually published data for (from
/// the `covered_years` column). An EMPTY vec means the postcode's crime
/// picture is unknown (force gap / unusable geometry) — it must not count
/// toward any year. A postcode ABSENT from this map (legacy parquet
/// without the column) is treated as covered for every year.
pub covered_years_by_postcode: FxHashMap<String, Vec<i32>>,
}
impl CrimeByYearData {
@ -165,9 +179,44 @@ impl CrimeByYearData {
years_by_type.push(years_for_type.into_iter().collect());
}
// Force-coverage calendar (optional column: legacy parquets predate it;
// their postcodes are treated as fully covered). A row with an empty
// list is meaningful — zero covered years — so it IS inserted.
let mut covered_years_by_postcode: FxHashMap<String, Vec<i32>> =
FxHashMap::default();
if let Ok(col) = df.column(COVERAGE_COLUMN) {
let list_ca = col
.list()
.with_context(|| format!("Column '{COVERAGE_COLUMN}' is not a list"))?;
for (row, postcode) in postcode_values.iter().enumerate().take(row_count) {
let Some(inner) = list_ca.get_as_series(row) else {
// Null coverage: treat as legacy/fully covered (skip).
continue;
};
let mut years: Vec<i32> = Vec::with_capacity(inner.len());
if !inner.is_empty() {
let structs = inner.struct_().with_context(|| {
format!("Inner of '{COVERAGE_COLUMN}' is not a struct")
})?;
let year_field = structs.field_by_name("year").with_context(|| {
format!("Missing 'year' field in '{COVERAGE_COLUMN}'")
})?;
for idx in 0..inner.len() {
match year_field.get(idx).ok() {
Some(AnyValue::Int32(y)) => years.push(y),
Some(AnyValue::Int64(y)) => years.push(y as i32),
_ => continue,
}
}
}
covered_years_by_postcode.insert(postcode.clone(), years);
}
}
info!(
postcodes = series_by_postcode.len(),
crime_types = crime_types.len(),
with_coverage = covered_years_by_postcode.len(),
"Crime-by-year data loaded"
);
@ -175,6 +224,7 @@ impl CrimeByYearData {
crime_types,
years_by_type,
series_by_postcode,
covered_years_by_postcode,
})
}
}

View file

@ -474,7 +474,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Aggregate of serious crime categories per year",
detail: "Sum of violence, robbery, burglary, and weapons possession per year within 50m of the postcode, counted from police.uk street-level crime points (anonymised, snapped to nearby map points). Provides a single serious crime metric.",
detail: "Sum of violence, robbery, burglary, and weapons possession per year near the postcode, counted from police.uk street-level crime points (anonymised, snapped to nearby map points). This is an area-normalised incident density for the surrounding streets, not a per-resident risk: busy commercial centres rank high however few people live there. Averaged over the months the local police force actually published data; known force gaps (e.g. Greater Manchester since mid-2019) are excluded rather than counted as zero crime.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -489,7 +489,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Aggregate of minor crime categories per year",
detail: "Sum of anti-social behaviour, shoplifting, bicycle theft, and other lower-severity crime per year within 50m of the postcode, counted from police.uk street-level crime points (anonymised, snapped to nearby map points). Provides a single minor crime metric.",
detail: "Sum of anti-social behaviour, shoplifting, bicycle theft, and other lower-severity crime per year near the postcode, counted from police.uk street-level crime points (anonymised, snapped to nearby map points). This is an area-normalised incident density for the surrounding streets, not a per-resident risk: busy commercial centres rank high however few people live there. Averaged over the months the local police force actually published data; known force gaps (e.g. Greater Manchester since mid-2019) are excluded rather than counted as zero crime.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -504,7 +504,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly violent and sexual offences in the area",
detail: "Average number of violence and sexual offences per year within 50m of the postcode, from police.uk street-level crime data. Includes assault, harassment, and sexual offences.",
detail: "Average number of violence and sexual offences per year near the postcode, from police.uk street-level crime data. Includes assault, harassment, and sexual offences.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -519,7 +519,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly burglary offences in the area",
detail: "Average number of burglary offences per year within 50m of the postcode, from police.uk street-level crime data. Includes residential and commercial burglary.",
detail: "Average number of burglary offences per year near the postcode, from police.uk street-level crime data. Includes residential and commercial burglary.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -534,7 +534,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly robbery offences in the area",
detail: "Average number of robbery offences per year within 50m of the postcode, from police.uk street-level crime data. Robbery involves theft with force or threat of force.",
detail: "Average number of robbery offences per year near the postcode, from police.uk street-level crime data. Robbery involves theft with force or threat of force.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -549,7 +549,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly vehicle crime in the area",
detail: "Average number of vehicle crime incidents per year within 50m of the postcode, from police.uk street-level crime data. Includes theft of and from vehicles.",
detail: "Average number of vehicle crime incidents per year near the postcode, from police.uk street-level crime data. Includes theft of and from vehicles.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -564,7 +564,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly anti-social behaviour incidents in the area",
detail: "Average number of anti-social behaviour incidents per year within 50m of the postcode, from police.uk street-level crime data. Includes nuisance, environmental, and personal anti-social behaviour.",
detail: "Average number of anti-social behaviour incidents per year near the postcode, from police.uk street-level crime data. Includes nuisance, environmental, and personal anti-social behaviour.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -579,7 +579,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly criminal damage and arson in the area",
detail: "Average number of criminal damage and arson incidents per year within 50m of the postcode, from police.uk street-level crime data.",
detail: "Average number of criminal damage and arson incidents per year near the postcode, from police.uk street-level crime data.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -594,7 +594,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly other theft offences in the area",
detail: "Average number of 'other theft' offences per year within 50m of the postcode, from police.uk street-level crime data. Includes theft not classified under burglary, vehicle crime, shoplifting, or bicycle theft.",
detail: "Average number of 'other theft' offences per year near the postcode, from police.uk street-level crime data. Includes theft not classified under burglary, vehicle crime, shoplifting, or bicycle theft.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -609,7 +609,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly theft from the person in the area",
detail: "Average number of theft from the person offences per year within 50m of the postcode, from police.uk street-level crime data. Includes pickpocketing and bag snatching without force.",
detail: "Average number of theft from the person offences per year near the postcode, from police.uk street-level crime data. Includes pickpocketing and bag snatching without force.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -624,7 +624,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly shoplifting offences in the area",
detail: "Average number of shoplifting offences per year within 50m of the postcode, from police.uk street-level crime data.",
detail: "Average number of shoplifting offences per year near the postcode, from police.uk street-level crime data.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -639,7 +639,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly bicycle theft in the area",
detail: "Average number of bicycle theft offences per year within 50m of the postcode, from police.uk street-level crime data.",
detail: "Average number of bicycle theft offences per year near the postcode, from police.uk street-level crime data.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -654,7 +654,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly drug offences in the area",
detail: "Average number of drug offences per year within 50m of the postcode, from police.uk street-level crime data. Includes possession and trafficking offences.",
detail: "Average number of drug offences per year near the postcode, from police.uk street-level crime data. Includes possession and trafficking offences.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -669,7 +669,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly weapons possession offences in the area",
detail: "Average number of possession of weapons offences per year within 50m of the postcode, from police.uk street-level crime data.",
detail: "Average number of possession of weapons offences per year near the postcode, from police.uk street-level crime data.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -684,7 +684,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly public order offences in the area",
detail: "Average number of public order offences per year within 50m of the postcode, from police.uk street-level crime data. Includes causing fear, alarm, or distress.",
detail: "Average number of public order offences per year near the postcode, from police.uk street-level crime data. Includes causing fear, alarm, or distress.",
source: "crime",
prefix: "",
suffix: "/yr",
@ -699,7 +699,7 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
},
step: 1.0,
description: "Average yearly other crime in the area",
detail: "Average number of other crime offences per year within 50m of the postcode, from police.uk street-level crime data. A catch-all category for offences not classified elsewhere.",
detail: "Average number of other crime offences per year near the postcode, from police.uk street-level crime data. A catch-all category for offences not classified elsewhere.",
source: "crime",
prefix: "",
suffix: "/yr",

View file

@ -391,7 +391,7 @@ pub fn build_system_prompt(
- Use EXACT feature names from the list spelling, capitalisation, and punctuation must match.\n\
- \"cheap\" / \"affordable\" = lower price range. \"expensive\" = higher price range.\n\
- \"low crime\" / \"safe\" = low values on the Serious crime (avg/yr) and Minor crime (avg/yr) \
features (incidents counted within 50m of the postcode). Prefer these aggregates for broad \
features (area-normalised incident density near the postcode). Prefer these aggregates for broad \
area safety; use specific crime features only when the user names a crime type.\n\
- \"quiet\" = low Noise (dB). \"green\" / \"near parks\" = high Number of amenities (Park) within 2km \
or low Distance to nearest park (km), depending on wording.\n\
@ -1167,7 +1167,8 @@ pub async fn post_ai_filters(
.to_string();
// Count matching properties and refine if too restrictive
let (match_count, match_bounds) = count_matching_rows(&state, &filters, &travel_time_filters);
let (match_count, match_bounds) =
count_matching_rows(&state, &filters, &travel_time_filters);
info!(
match_count = match_count,
round = round,

View file

@ -258,10 +258,17 @@ pub fn compute_feature_stats(
/// Compute property-weighted per-year crime means across the selection.
///
/// Each matching property contributes its postcode's per-year counts (incidents
/// within 50m of that postcode); this is the same property-weighted-average
/// shape used elsewhere in the right pane. Postcodes with no series for a given
/// crime type contribute 0 for that type (matching how the `(avg/yr)` columns
/// treat missing crime types).
/// near that postcode); this is the same property-weighted-average shape used
/// elsewhere in the right pane.
///
/// Denominators are COVERAGE-AWARE: police.uk has multi-year publication gaps
/// for whole forces (e.g. Greater Manchester from 2019-07), and the pipeline
/// emits a `covered_years` calendar per postcode. A postcode only counts toward
/// a year's denominator if its force published that year; only for such covered
/// years does an absence of recorded incidents mean a genuine zero. Years no
/// selected postcode covers are omitted entirely (charted as gaps, not zeros).
/// Postcodes without coverage info (legacy parquet without the column) count
/// toward every year, restoring the previous behaviour.
pub fn compute_crime_by_year(
matching_rows: &[usize],
data: &PropertyData,
@ -273,27 +280,34 @@ pub fn compute_crime_by_year(
return Vec::new();
}
// For each crime type, accumulate per-year sums and the count of rows whose
// postcode exists in the crime side table.
let num_types = crime_by_year.crime_types.len();
let mut per_type_year_sums: Vec<FxHashMap<i32, f64>> =
(0..num_types).map(|_| FxHashMap::default()).collect();
let mut per_type_row_counts: Vec<u32> = vec![0; num_types];
// Per-year denominator parts: rows whose coverage calendar includes the
// year, plus rows with no calendar at all (legacy: covered everywhere).
let mut covered_counts: FxHashMap<i32, u32> = FxHashMap::default();
let mut fully_covered_rows: u32 = 0;
for &row in matching_rows {
let postcode = data.postcode(row);
// A postcode absent from the by-year table has no recorded crime within
// 50m, so it contributes 0 to every type's per-year sum. It must still be
// counted in the denominator: the matching `(avg/yr)` stat counts those
// same zero-crime postcodes as 0.0 (crime_by_postcode.parquet has a dense
// row for every boundary postcode), so excluding them here would compute
// the chart over a smaller population and report a higher magnitude than
// the headline. Property postcodes are guaranteed to be boundary
// postcodes by the postcode-boundary-match validation, so "absent" means
// genuinely zero-crime, not missing data.
match crime_by_year.covered_years_by_postcode.get(postcode) {
Some(years) => {
// An empty list (force gap for the whole window / unusable
// boundary geometry) adds nothing: the postcode's crime
// picture is unknown and must not dilute any year's mean.
for &year in years {
*covered_counts.entry(year).or_insert(0) += 1;
}
}
None => fully_covered_rows += 1,
}
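// A postcode with a row but no series for a given type had no recorded
// incidents of that type: it contributes 0 to the sums, and its covered
// years still count in the denominator — a genuine zero. Uncovered years
// are excluded only through the denominators.
// NOTE(review): the accumulation below appears to add every series point
// without a coverage check, so this presumes the by-year series carry no
// counts for (postcode, year)s outside the coverage calendar — otherwise
// those counts would inflate the mean. Confirm against the pipeline.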
if let Some(series_list) = crime_by_year.series_by_postcode.get(postcode) {
// For every type the postcode reports, add its per-year counts.
for series in series_list {
let acc = &mut per_type_year_sums[series.type_idx as usize];
for point in &series.points {
@ -301,9 +315,6 @@ pub fn compute_crime_by_year(
}
}
}
for c in per_type_row_counts.iter_mut() {
*c += 1;
}
}
let mut out = Vec::new();
@ -317,10 +328,6 @@ pub fn compute_crime_by_year(
continue;
}
}
let row_count = per_type_row_counts[type_idx];
if row_count == 0 {
continue;
}
let years = crime_by_year
.years_by_type
.get(type_idx)
@ -329,15 +336,26 @@ pub fn compute_crime_by_year(
if years.is_empty() {
continue;
}
let denom = row_count as f64;
let sums = &per_type_year_sums[type_idx];
let points: Vec<CrimeYearPoint> = years
.iter()
.map(|&year| CrimeYearPoint {
year,
count: (sums.get(&year).copied().unwrap_or(0.0) / denom) as f32,
.filter_map(|&year| {
let denom = fully_covered_rows
+ covered_counts.get(&year).copied().unwrap_or(0);
if denom == 0 {
// No selected postcode has published data for this year.
return None;
}
Some(CrimeYearPoint {
year,
count: (sums.get(&year).copied().unwrap_or(0.0) / denom as f64)
as f32,
})
})
.collect();
if points.is_empty() {
continue;
}
out.push(CrimeYearStats {
name: name.clone(),
points,