alright

2026-05-26 19:45:13 +01:00 · 2026-05-26 19:45:13 +01:00 · 39ef5c6646
commit 39ef5c6646
parent c645b0f1d4
79 changed files with 5660 additions and 2199 deletions
--- a/server-rs/src/data/actual_listings.rs
+++ b/server-rs/src/data/actual_listings.rs
@ -61,6 +61,9 @@ pub struct ActualListingData {
    /// overlaid where available. This lets the listings endpoint use the same filter
    /// execution path as the property endpoints.
    pub filter_feature_data: Vec<u16>,
+    /// Row-major dynamic postcode POI metrics aligned with
+    /// PropertyData::poi_metrics.feature_names.
+    pub poi_filter_feature_data: Vec<u16>,
    pub grid: GridIndex,
 }

@ -109,16 +112,16 @@ impl ActualListingData {
        let listing_status = InternedColumn::build(&opt_to_string(&listing_status_raw));

        let filter_feature_data = build_filter_feature_data(
+            &df,
            property_data,
-            &postcode,
-            &address,
            &property_type_raw,
            &leasehold_freehold_raw,
            &rooms_total,
            &floor_area_sqm,
            &asking_price,
            &asking_price_per_sqm,
-        );
+        )?;
+        let poi_filter_feature_data = build_poi_filter_feature_data(&df, property_data)?;

        let grid = GridIndex::build(&lat, &lon, GRID_CELL_SIZE);

@ -144,6 +147,7 @@ impl ActualListingData {
            listing_date_iso,
            features,
            filter_feature_data,
+            poi_filter_feature_data,
            grid,
        })
    }
@ -174,49 +178,37 @@ impl ActualListingData {

 #[allow(clippy::too_many_arguments)]
 fn build_filter_feature_data(
+    df: &DataFrame,
    property_data: Option<&PropertyData>,
-    postcode: &[String],
-    address: &[Option<String>],
    property_type: &[Option<String>],
    leasehold_freehold: &[Option<String>],
    rooms_total: &[Option<i32>],
    floor_area_sqm: &[Option<f32>],
    asking_price: &[Option<i64>],
    asking_price_per_sqm: &[Option<f32>],
-) -> Vec<u16> {
+) -> Result<Vec<u16>> {
    let Some(property_data) = property_data else {
-        return Vec::new();
+        return Ok(Vec::new());
    };

    let num_features = property_data.num_features;
-    let mut feature_data = vec![NAN_U16; postcode.len() * num_features];
-    let mut joined_rows = 0usize;
+    let row_count = df.height();
+    let mut feature_data = vec![NAN_U16; row_count * num_features];
+    let quant = property_data.quant_ref();
+    let mut encoded_columns = 0usize;

-    for (row, postcode_value) in postcode.iter().enumerate() {
-        let Some(address_value) = address[row]
-            .as_deref()
-            .map(str::trim)
-            .filter(|v| !v.is_empty())
-        else {
-            continue;
-        };
-
-        let query = format!("{address_value} {postcode_value}");
-        let Some(&property_row) = property_data.search_addresses(&query, 1).first() else {
-            continue;
-        };
-        if property_data.postcode(property_row) != postcode_value {
-            continue;
+    for (feat_idx, name) in property_data.feature_names.iter().enumerate() {
+        if feat_idx < property_data.num_numeric {
+            if let Some(values) = extract_optional_feature_f32(df, name)? {
+                encode_numeric_feature(&mut feature_data, property_data, &quant, feat_idx, values);
+                encoded_columns += 1;
+            }
+        } else if let Some(values) = extract_optional_feature_str(df, name)? {
+            encode_enum_feature(&mut feature_data, property_data, feat_idx, values);
+            encoded_columns += 1;
        }
-
-        let dst = row * num_features;
-        let src = property_row * num_features;
-        feature_data[dst..dst + num_features]
-            .copy_from_slice(&property_data.feature_data[src..src + num_features]);
-        joined_rows += 1;
    }

-    let quant = property_data.quant_ref();
    overlay_numeric_feature(
        &mut feature_data,
        property_data,
@ -281,11 +273,50 @@ fn build_filter_feature_data(
    );

    info!(
-        rows = postcode.len(),
-        joined_rows, "Actual listings joined to property feature matrix"
+        rows = row_count,
+        encoded_columns, "Actual listings feature matrix read from enriched parquet"
    );

-    feature_data
+    Ok(feature_data)
+}
+
+fn build_poi_filter_feature_data( // Builds a row-major u16 matrix (df rows x POI metrics) from df columns named in poi_metrics.feature_names.
+    df: &DataFrame,
+    property_data: Option<&PropertyData>,
+) -> Result<Vec<u16>> {
+    let Some(property_data) = property_data else {
+        return Ok(Vec::new()); // no property data supplied: empty matrix
+    };
+    let poi_metrics = &property_data.poi_metrics;
+    let num_features = poi_metrics.num_features();
+    if num_features == 0 {
+        return Ok(Vec::new()); // zero POI metrics: nothing to encode
+    }

+    let row_count = df.height();
+    let mut feature_data = vec![NAN_U16; row_count * num_features]; // every cell starts as NAN_U16; skipped columns keep it
+    let quant = poi_metrics.quant_ref();
+    let mut encoded_columns = 0usize; // only reported in the log line below

+    for (metric_idx, name) in poi_metrics.feature_names.iter().enumerate() {
+        let Some(values) = extract_optional_feature_f32(df, name)? else {
+            continue; // column could not be looked up in df: its cells stay NAN_U16
+        };
+        for (row, value) in values.into_iter().enumerate() {
+            let dst = row * num_features + metric_idx; // row-major: one stride of num_features per listing row
+            feature_data[dst] = value
+                .map(|value| encode_numeric_value(&quant, metric_idx, value))
+                .unwrap_or(NAN_U16); // missing value -> NAN_U16
+        }
+        encoded_columns += 1;
+    }

+    info!(
+        rows = row_count,
+        encoded_columns, "Actual listings POI metrics read from enriched parquet"
+    );

+    Ok(feature_data)
 }

 fn feature_index(property_data: &PropertyData, name: &str) -> Option<usize> {
@ -323,6 +354,53 @@ fn overlay_numeric_feature<I>(
    }
 }

+fn encode_numeric_feature<I>( // Writes one numeric feature column into the row-major `feature_data` matrix.
+    feature_data: &mut [u16],
+    property_data: &PropertyData,
+    quant: &QuantRef<'_>,
+    feat_idx: usize,
+    values: I,
+) where
+    I: IntoIterator<Item = Option<f32>>,
+{
+    let num_features = property_data.num_features; // row stride of the matrix
+    for (row, value) in values.into_iter().enumerate() {
+        let dst = row * num_features + feat_idx; // indexing panics if `values` yields more rows than the matrix holds
+        feature_data[dst] = value
+            .map(|value| encode_numeric_value(quant, feat_idx, value))
+            .unwrap_or(NAN_U16); // None overwrites the cell with NAN_U16
+    }
+}
+
+fn extract_optional_feature_f32(df: &DataFrame, name: &str) -> Result<Option<Vec<Option<f32>>>> { // Column `name` as per-row Option<f32>.
+    let Ok(column) = df.column(name) else {
+        return Ok(None); // lookup failure is reported as "feature absent", not as an error
+    };

+    if matches!(column.dtype(), DataType::Datetime(_, _) | DataType::Date) { // temporal columns get a dedicated conversion
+        let projected = df
+            .clone()
+            .lazy()
+            .select([(col(name).dt().year().cast(DataType::Float32)
+                + (col(name).dt().month().cast(DataType::Float32) - lit(1.0f32)) / lit(12.0f32))
+            .alias("__feature")]) // fractional year: year + (month - 1) / 12
+            .collect()
+            .with_context(|| format!("Failed to convert datetime feature '{name}'"))?;
+        return Ok(Some(extract_opt_f32(&projected, "__feature")?));
+    }

+    let cast = column
+        .cast(&DataType::Float32) // a failed cast is an error, unlike a failed lookup
+        .with_context(|| format!("Failed to cast feature '{name}' to Float32"))?;
+    let values = cast
+        .f32()
+        .with_context(|| format!("Feature '{name}' is not Float32"))?
+        .into_iter()
+        .map(|value| value.filter(|v| v.is_finite())) // NaN and +/-inf become None
+        .collect();
+    Ok(Some(values))
+}
+
 fn overlay_enum_feature<'a, I>(
    feature_data: &mut [u16],
    property_data: &PropertyData,
@ -355,6 +433,46 @@ fn overlay_enum_feature<'a, I>(
    }
 }

+fn encode_enum_feature( // Writes one enum feature column into `feature_data` as positions in the feature's `enum_values` list.
+    feature_data: &mut [u16],
+    property_data: &PropertyData,
+    feat_idx: usize,
+    values: Vec<Option<String>>,
+) {
+    let Some(enum_values) = property_data.enum_values.get(&feat_idx) else {
+        return; // no enum table for this feature: cells untouched (build_filter_feature_data still counts the column)
+    };
+    let num_features = property_data.num_features; // row stride of the matrix
+    for (row, value) in values.into_iter().enumerate() {
+        let dst = row * num_features + feat_idx;
+        feature_data[dst] = value
+            .as_deref()
+            .map(str::trim)
+            .filter(|text| !text.is_empty())
+            .and_then(|text| enum_values.iter().position(|candidate| candidate == text)) // exact match on trimmed text, linear scan per row
+            .map(|position| position as u16) // NOTE(review): `as u16` wraps for positions above u16::MAX - confirm tables stay small
+            .unwrap_or(NAN_U16); // null, blank, or unknown label -> NAN_U16
+    }
+}
+
+fn extract_optional_feature_str(df: &DataFrame, name: &str) -> Result<Option<Vec<Option<String>>>> { // Column `name` as per-row Option<String>.
+    let Ok(column) = df.column(name) else {
+        return Ok(None); // lookup failure is reported as "feature absent", not as an error
+    };
+    let cast = column
+        .cast(&DataType::String) // a failed cast is an error, unlike a failed lookup
+        .with_context(|| format!("Failed to cast feature '{name}' to String"))?;
+    let strings = cast
+        .str()
+        .with_context(|| format!("Feature '{name}' is not a string column"))?;
+    Ok(Some(
+        strings
+            .into_iter()
+            .map(|value| value.and_then(|text| (!text.trim().is_empty()).then(|| text.to_string()))) // blank -> None; kept text is not trimmed
+            .collect(),
+    ))
+}
+
 fn encode_numeric_value(quant: &QuantRef<'_>, feat_idx: usize, value: f32) -> u16 {
    if !value.is_finite() {
        return NAN_U16;
@ -517,8 +635,13 @@ mod tests {
    use std::path::PathBuf;

    fn sample_path() -> Option<PathBuf> {
-        let path = PathBuf::from("../finder/data/online_listings_buy.parquet");
-        path.exists().then_some(path)
+        [ // candidate fixtures in priority order: enriched parquet first, then the plain one
+            "../finder/data/online_listings_buy_enriched.parquet",
+            "../finder/data/online_listings_buy.parquet",
+        ]
+        .into_iter()
+        .map(PathBuf::from)
+        .find(|path| path.exists()) // first candidate that exists on disk, or None when neither does
    }

    #[test]
--- a/server-rs/src/data/poi.rs
+++ b/server-rs/src/data/poi.rs
@ -63,7 +63,20 @@ const DASHBOARD_POI_GROUPS: &[(&str, &[&str])] = &[
    ("Groceries", GROCERY_DASHBOARD_CATEGORIES),
    ("Food & Drink", &["Café", "Restaurant", "Pub", "Fast Food"]),
    ("Green Space", &["Park", "Playground"]),
-    ("Education", &["School"]),
+    (
+        "Education",
+        &[
+            "Nursery school",
+            "Primary school",
+            "Secondary school",
+            "All-through school",
+            "Sixth form",
+            "Further education college",
+            "University",
+            "Special school",
+            "School",
+        ],
+    ),
    (
        "Health",
        &["GP Surgery", "Pharmacy", "Dentist", "Hospital & Clinic"],
@ -119,6 +132,21 @@ fn canonical_poi_category(category: &str) -> &str {
    }
 }

+/// Categories the pipeline emits for the GIAS-derived school POIs. `resolve_poi_category_filter`
+/// expands a bare `School` filter value (e.g. a `poi=School` URL predating the per-phase split)
+/// to all of these so bookmarked links keep showing schools. Mirrors the "Education" group list.
+const SCHOOL_CATEGORY_ALIASES: &[&str] = &[
+    "Nursery school",
+    "Primary school",
+    "Secondary school",
+    "All-through school",
+    "Sixth form",
+    "Further education college",
+    "University",
+    "Special school",
+    "School",
+];
+
 pub fn resolve_poi_category_filter(category_values: &[String], categories: &str) -> FxHashSet<u16> {
    let mut selected = FxHashSet::default();
    for part in categories.split(',') {
@ -126,6 +154,12 @@ pub fn resolve_poi_category_filter(category_values: &[String], categories: &str)
        if category.is_empty() {
            continue;
        }
+        if category == "School" {
+            for alias in SCHOOL_CATEGORY_ALIASES {
+                add_category_filter_index(category_values, alias, &mut selected);
+            }
+            continue;
+        }
        add_category_filter_index(category_values, category, &mut selected);
    }
    selected
@ -174,6 +208,8 @@ pub struct SchoolMetadata {
    pub telephone: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub head_name: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub ofsted_rating: Option<String>,
 }

 pub struct POIData {
@ -350,6 +386,8 @@ fn build_school_meta(
    let website = extract_optional_str_col(df, "school_website")?.unwrap_or_default();
    let telephone = extract_optional_str_col(df, "school_telephone")?.unwrap_or_default();
    let head_name = extract_optional_str_col(df, "school_head_name")?.unwrap_or_default();
+    let ofsted_rating =
+        extract_optional_str_col(df, "school_ofsted_rating")?.unwrap_or_default();

    let fetch_str = |col: &Vec<Option<String>>, row: usize| -> Option<String> {
        col.get(row).cloned().flatten()
@ -390,6 +428,7 @@ fn build_school_meta(
            website: fetch_str(&website, row),
            telephone: fetch_str(&telephone, row),
            head_name: fetch_str(&head_name, row),
+            ofsted_rating: fetch_str(&ofsted_rating, row),
        });
    }
    Ok((idx, meta))
@ -578,6 +617,26 @@ mod tests {
        assert!(selected.is_empty());
    }

+    #[test]
+    fn legacy_school_filter_expands_to_all_school_categories() {
+        // Bookmarked URLs from before the per-phase split sent `poi=School`;
+        // they should still match every school category that's loaded.
+        let values = vec![
+            "Primary school".to_string(),
+            "Secondary school".to_string(),
+            "University".to_string(),
+            "Tesco".to_string(), // non-school category: must stay unselected
+        ];

+        let selected = resolve_poi_category_filter(&values, "School");

+        assert!(selected.contains(&0)); // selected entries are positions within `values`
+        assert!(selected.contains(&1));
+        assert!(selected.contains(&2));
+        assert!(!selected.contains(&3));
+        assert_eq!(selected.len(), 3); // aliases that are not in `values` contribute nothing
+    }
+
    #[test]
    fn coop_category_aliases_resolve_to_single_category() {
        let values = vec!["Co-op".to_string(), "Tesco".to_string()];
--- a/server-rs/src/data/property.rs
+++ b/server-rs/src/data/property.rs
@ -891,6 +891,15 @@ impl PropertyData {
        (&self.postcode_interner, &self.postcode_keys)
    }

+    /// Property rows for a given postcode string; an empty slice if it is not interned or has no row-index entry.
+    pub fn rows_for_postcode(&self, postcode: &str) -> &[u32] {
+        self.postcode_interner
+            .get(postcode) // NOTE(review): looked up as given - presumably callers pass the same form that was interned; confirm
+            .and_then(|key| self.postcode_row_index.get(&key))
+            .map(Vec::as_slice)
+            .unwrap_or(&[]) // either lookup missing -> empty slice
+    }
+
    fn row_address_search_tokens(&self, row: usize) -> &[lasso::Spur] {
        let offset = self.address_search_token_offsets[row] as usize;
        let length = self.address_search_token_lengths[row] as usize;