Fix data pipelines once and for all

This commit is contained in:
Andras Schmelczer 2026-06-10 21:27:32 +01:00
parent 08560476c5
commit 4012e4e047
46 changed files with 4508 additions and 855 deletions

View file

@ -55,6 +55,7 @@ const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
&[
"Rail station",
"Tube station",
"Tram & Metro stop",
"Bus station",
"Bus stop",
"Airport",
@ -79,7 +80,7 @@ const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
),
(
"Health",
&["GP Surgery", "Pharmacy", "Dentist", "Hospital & Clinic"],
&["GP Surgery", "Pharmacy", "Dentist", "Hospital", "Clinic"],
),
(
"Leisure",

View file

@ -180,20 +180,6 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
raw: false,
absolute: true,
}),
Feature::Enum(EnumFeatureConfig {
name: "Within conservation area",
order: Some(&["Yes", "No"]),
description: "Whether the postcode point falls inside a designated conservation area",
detail: "Planning Data conservation area boundaries, matched to the postcode representative point. The national dataset is a work in progress and may include duplicates or incomplete local coverage, so boundary-sensitive decisions should be checked with the local planning authority.",
source: "conservation-areas",
}),
Feature::Enum(EnumFeatureConfig {
name: "Listed building",
order: Some(&["Yes", "No"]),
description: "Whether this property appears to match a Historic England listed building entry",
detail: "Historic England National Heritage List for England listed-building points, matched conservatively to property addresses using the listed-entry name and nearby postcode candidates. Treat this as a screening signal, not a legal determination: verify any specific property on the NHLE and with the local planning authority.",
source: "listed-buildings",
}),
Feature::Numeric(FeatureConfig {
name: "Noise (dB)",
bounds: Bounds::Fixed {
@ -209,6 +195,20 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
raw: false,
absolute: false,
}),
Feature::Enum(EnumFeatureConfig {
name: "Within conservation area",
order: Some(&["Yes", "No"]),
description: "Whether the postcode point falls inside a designated conservation area",
detail: "Planning Data conservation area boundaries, matched to the postcode representative point. The national dataset is a work in progress and may include duplicates or incomplete local coverage, so boundary-sensitive decisions should be checked with the local planning authority.",
source: "conservation-areas",
}),
Feature::Enum(EnumFeatureConfig {
name: "Listed building",
order: Some(&["Yes", "No"]),
description: "Whether this property appears to match a Historic England listed building entry",
detail: "Historic England National Heritage List for England listed-building points, matched conservatively to property addresses using the listed-entry name and nearby postcode candidates. Treat this as a screening signal, not a legal determination: verify any specific property on the NHLE and with the local planning authority.",
source: "listed-buildings",
}),
],
},
FeatureGroup {
@ -307,89 +307,14 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
name: "Schools",
features: &[
Feature::Numeric(FeatureConfig {
name: "Good+ primary schools within 2km",
bounds: Bounds::Fixed {
min: 0.0,
max: 10.0,
},
step: 1.0,
description: "Primary schools rated Good or Outstanding by Ofsted within 2km",
detail: "State-funded primary schools within 2km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Good+ secondary schools within 2km",
bounds: Bounds::Fixed {
min: 0.0,
max: 5.0,
},
step: 1.0,
description: "Secondary schools rated Good or Outstanding by Ofsted within 2km",
detail: "State-funded secondary schools within 2km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Outstanding primary schools within 2km",
bounds: Bounds::Fixed {
min: 0.0,
max: 10.0,
},
step: 1.0,
description: "Primary schools rated Outstanding by Ofsted within 2km",
detail: "State-funded primary schools within 2km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Outstanding secondary schools within 2km",
bounds: Bounds::Fixed {
min: 0.0,
max: 5.0,
},
step: 1.0,
description: "Secondary schools rated Outstanding by Ofsted within 2km",
detail: "State-funded secondary schools within 2km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Good+ primary schools within 5km",
bounds: Bounds::Fixed {
min: 0.0,
max: 30.0,
},
step: 1.0,
description: "Primary schools rated Good or Outstanding by Ofsted within 5km",
detail: "State-funded primary schools within 5km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Good+ secondary schools within 5km",
name: "Good+ primary school catchments",
bounds: Bounds::Fixed {
min: 0.0,
max: 15.0,
},
step: 1.0,
description: "Secondary schools rated Good or Outstanding by Ofsted within 5km",
detail: "State-funded secondary schools within 5km with a current Ofsted rating of Good or Outstanding. Schools not yet inspected are excluded.",
description: "Primary schools rated Good or Outstanding whose modelled catchment area covers this postcode",
detail: "How many state-funded primary schools with a current Ofsted rating of Good or Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
@ -397,14 +322,14 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Outstanding primary schools within 5km",
name: "Good+ secondary school catchments",
bounds: Bounds::Fixed {
min: 0.0,
max: 30.0,
max: 11.0,
},
step: 1.0,
description: "Primary schools rated Outstanding by Ofsted within 5km",
detail: "State-funded primary schools within 5km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
description: "Secondary schools rated Good or Outstanding whose modelled catchment area covers this postcode",
detail: "How many state-funded secondary schools with a current Ofsted rating of Good or Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
@ -412,14 +337,29 @@ pub static FEATURE_GROUPS: &[FeatureGroup] = &[
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Outstanding secondary schools within 5km",
name: "Outstanding primary school catchments",
bounds: Bounds::Fixed {
min: 0.0,
max: 15.0,
max: 8.0,
},
step: 1.0,
description: "Secondary schools rated Outstanding by Ofsted within 5km",
detail: "State-funded secondary schools within 5km with a current Ofsted rating of Outstanding. Schools not yet inspected are excluded.",
description: "Primary schools rated Outstanding whose modelled catchment area covers this postcode",
detail: "How many state-funded primary schools with a current Ofsted rating of Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",
raw: false,
absolute: false,
}),
Feature::Numeric(FeatureConfig {
name: "Outstanding secondary school catchments",
bounds: Bounds::Fixed {
min: 0.0,
max: 4.0,
},
step: 1.0,
description: "Secondary schools rated Outstanding whose modelled catchment area covers this postcode",
detail: "How many state-funded secondary schools with a current Ofsted rating of Outstanding draw their pupils from an area covering this postcode. Catchment radii are modelled by simulating England's distance-based admissions (each school's places against the local child population, Census 2021) and calibrated against published 'last distance offered' figures; they are estimates, not official admission areas. Schools not yet inspected are excluded.",
source: "ofsted",
prefix: "",
suffix: "",

View file

@ -62,6 +62,42 @@ pub struct AiFiltersResponse {
notes: String,
/// Number of properties matching the proposed property and travel time filters.
match_count: usize,
/// Bounding box of the matching properties so the client can move the
/// camera to where matches actually are. Absent when nothing matches.
#[serde(skip_serializing_if = "Option::is_none")]
match_bounds: Option<MatchBounds>,
}
/// Axis-aligned bounding box described by its four edges.
///
/// `south`/`north` are the lower/upper latitude edges and `west`/`east` the
/// lower/upper longitude edges. Values are presumably in decimal degrees,
/// matching the coordinate columns they are read from — TODO confirm.
/// Serialized with serde using the field names as written.
#[derive(Serialize)]
pub struct MatchBounds {
// Lower latitude edge.
south: f32,
// Lower longitude edge.
west: f32,
// Upper latitude edge.
north: f32,
// Upper longitude edge.
east: f32,
}
/// Bounding box over matched coordinates, trimmed to the 5th to 95th
/// percentile per axis (when there are at least 20 points) so a handful of
/// remote outliers doesn't zoom the camera out to all of England.
///
/// `lats` and `lons` are parallel vectors: element `i` of each describes the
/// same point. Each axis is sorted and trimmed independently, so the
/// resulting box is not guaranteed to have a matched point on every edge.
///
/// Pairs in which either coordinate is non-finite (NaN or infinite) are
/// discarded before the bounds are computed. Without this, `total_cmp`
/// sorts such values to the extremes and, for small result sets where no
/// trimming happens, they would be returned as a bound.
///
/// Returns `None` when the inputs have different lengths or when no pair
/// with finite coordinates remains.
fn percentile_trimmed_bounds(lats: Vec<f32>, lons: Vec<f32>) -> Option<MatchBounds> {
    if lats.len() != lons.len() {
        return None;
    }

    // Filter pairwise so the two axes keep the same number of points.
    let (mut lats, mut lons): (Vec<f32>, Vec<f32>) = lats
        .into_iter()
        .zip(lons)
        .filter(|(lat, lon)| lat.is_finite() && lon.is_finite())
        .unzip();
    if lats.is_empty() {
        return None;
    }

    lats.sort_unstable_by(f32::total_cmp);
    lons.sort_unstable_by(f32::total_cmp);

    // Drop 5% of the points from each end once there are enough of them for
    // that to remove at least one point per side.
    let last = lats.len() - 1;
    let trim = if lats.len() >= 20 { lats.len() / 20 } else { 0 };
    let (lo, hi) = (trim, last - trim);

    Some(MatchBounds {
        south: lats[lo],
        north: lats[hi],
        west: lons[lo],
        east: lons[hi],
    })
}
/// Strip markdown code fences (```json ... ``` or ``` ... ```) from LLM output.
@ -90,17 +126,12 @@ fn school_feature_name_from_key(name: &str) -> Option<&'static str> {
let mut parts = rest.split(':');
let phase = parts.next()?;
let rating = parts.next()?;
let distance = parts.next()?;
match (phase, rating, distance) {
("primary", "good", "2") => Some("Good+ primary schools within 2km"),
("secondary", "good", "2") => Some("Good+ secondary schools within 2km"),
("primary", "outstanding", "2") => Some("Outstanding primary schools within 2km"),
("secondary", "outstanding", "2") => Some("Outstanding secondary schools within 2km"),
("primary", "good", "5") => Some("Good+ primary schools within 5km"),
("secondary", "good", "5") => Some("Good+ secondary schools within 5km"),
("primary", "outstanding", "5") => Some("Outstanding primary schools within 5km"),
("secondary", "outstanding", "5") => Some("Outstanding secondary schools within 5km"),
match (phase, rating) {
("primary", "good") => Some("Good+ primary school catchments"),
("secondary", "good") => Some("Good+ secondary school catchments"),
("primary", "outstanding") => Some("Outstanding primary school catchments"),
("secondary", "outstanding") => Some("Outstanding secondary school catchments"),
_ => None,
}
}
@ -508,8 +539,8 @@ pub fn build_system_prompt(
{\"name\": \"Serious crime (avg/yr)\", \"bound\": \"max\", \"value\": 5}, \
{\"name\": \"Minor crime (avg/yr)\", \"bound\": \"max\", \"value\": 20}, \
{\"name\": \"Noise (dB)\", \"bound\": \"max\", \"value\": 55}, \
{\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}, \
{\"name\": \"Good+ secondary schools within 2km\", \"bound\": \"min\", \"value\": 1}, \
{\"name\": \"Good+ primary school catchments\", \"bound\": \"min\", \"value\": 2}, \
{\"name\": \"Good+ secondary school catchments\", \"bound\": \"min\", \"value\": 1}, \
{\"name\": \"Number of amenities (Park) within 2km\", \"bound\": \"min\", \"value\": 3}], \
\"enum_filters\": [], \"travel_time_filters\": [], \"notes\": \"\"}"
.to_string(),
@ -519,8 +550,8 @@ pub fn build_system_prompt(
"\nUser: \"quiet area with outstanding schools\"\n\
Output: {\"numeric_filters\": [\
{\"name\": \"Noise (dB)\", \"bound\": \"max\", \"value\": 55}, \
{\"name\": \"Outstanding primary schools within 2km\", \"bound\": \"min\", \"value\": 1}, \
{\"name\": \"Outstanding secondary schools within 2km\", \"bound\": \"min\", \"value\": 1}], \
{\"name\": \"Outstanding primary school catchments\", \"bound\": \"min\", \"value\": 1}, \
{\"name\": \"Outstanding secondary school catchments\", \"bound\": \"min\", \"value\": 1}], \
\"enum_filters\": [], \"travel_time_filters\": [], \"notes\": \"\"}"
.to_string(),
);
@ -557,8 +588,8 @@ pub fn build_system_prompt(
Output: {\"numeric_filters\": [\
{\"name\": \"Total floor area (sqm)\", \"bound\": \"min\", \"value\": 100}, \
{\"name\": \"Number of bedrooms & living rooms\", \"bound\": \"min\", \"value\": 5}, \
{\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}, \
{\"name\": \"Good+ secondary schools within 2km\", \"bound\": \"min\", \"value\": 1}], \
{\"name\": \"Good+ primary school catchments\", \"bound\": \"min\", \"value\": 2}, \
{\"name\": \"Good+ secondary school catchments\", \"bound\": \"min\", \"value\": 1}], \
\"enum_filters\": [{\"name\": \"Property type\", \
\"values\": [\"Detached\", \"Semi-Detached\"]}], \
\"travel_time_filters\": [{\"mode\": \"car\", \"slug\": \"manchester\", \
@ -592,7 +623,7 @@ pub fn build_system_prompt(
"\nUser: \"3 bed house under 500k with good schools\"\n\
Output: {\
\"numeric_filters\": [{\"name\": \"Estimated current price\", \"bound\": \"max\", \"value\": 500000}, \
{\"name\": \"Good+ primary schools within 2km\", \"bound\": \"min\", \"value\": 2}], \
{\"name\": \"Good+ primary school catchments\", \"bound\": \"min\", \"value\": 2}], \
\"enum_filters\": [{\"name\": \"Property type\", \
\"values\": [\"Detached\", \"Semi-Detached\", \"Terraced\"]}], \
\"travel_time_filters\": [], \
@ -759,7 +790,7 @@ fn count_matching_rows(
state: &AppState,
filters: &Value,
travel_time_filters: &[TravelTimeFilter],
) -> usize {
) -> (usize, Option<MatchBounds>) {
let filter_str = filters_to_filter_string(filters);
let quant = state.data.quant_ref();
@ -778,7 +809,7 @@ fn count_matching_rows(
Ok(f) => f,
Err(err) => {
warn!("Failed to parse filters for match count: {err}");
return 0;
return (0, None);
}
}
};
@ -800,6 +831,8 @@ fn count_matching_rows(
let has_poi_filters = !parsed_poi_filters.is_empty();
let mut count = 0usize;
let mut matched_lats: Vec<f32> = Vec::new();
let mut matched_lons: Vec<f32> = Vec::new();
for (row, pc_key) in pc_keys.iter().enumerate().take(num_rows) {
if !row_passes_filters(
row,
@ -836,9 +869,11 @@ fn count_matching_rows(
}
count += 1;
matched_lats.push(state.data.lat[row]);
matched_lons.push(state.data.lon[row]);
}
count
(count, percentile_trimmed_bounds(matched_lats, matched_lons))
}
/// Budget limits for the Gemini conversation loop. Separate counters prevent
@ -1132,7 +1167,7 @@ pub async fn post_ai_filters(
.to_string();
// Count matching properties and refine if too restrictive
let match_count = count_matching_rows(&state, &filters, &travel_time_filters);
let (match_count, match_bounds) = count_matching_rows(&state, &filters, &travel_time_filters);
info!(
match_count = match_count,
round = round,
@ -1173,6 +1208,7 @@ pub async fn post_ai_filters(
travel_time_filters,
notes,
match_count: 0,
match_bounds: None,
}));
}
@ -1236,6 +1272,7 @@ pub async fn post_ai_filters(
travel_time_filters,
notes,
match_count,
match_bounds,
}));
}
@ -1488,9 +1525,14 @@ mod tests {
#[test]
fn synthetic_filter_keys_are_normalized_to_backend_names() {
assert_eq!(
canonical_filter_name("Schools:primary:good:0"),
"Good+ primary school catchments"
);
// Legacy keys still carry a distance segment; it is ignored.
assert_eq!(
canonical_filter_name("Schools:primary:good:2:0"),
"Good+ primary schools within 2km"
"Good+ primary school catchments"
);
assert_eq!(
canonical_filter_name("Specific crimes:Burglary%20%28avg%2Fyr%29:1"),

View file

@ -68,8 +68,11 @@ pub async fn get_filter_counts(
let num_total_filters = num_regular + travel_filter_indices.len();
if num_total_filters == 0 {
// With no active filters the total is simply every property in bounds.
// count_in_bounds is O(grid cells), far cheaper than walking every row.
let total = state.grid.count_in_bounds(south, west, north, east) as u32;
return Ok(Json(FilterCountsResponse {
total: 0,
total,
impacts: FxHashMap::default(),
}));
}