has issues

2026-05-25 13:20:17 +01:00 · 2026-05-25 13:20:17 +01:00 · c645b0f1d4
commit c645b0f1d4
parent 2e112d7398
96 changed files with 2147083 additions and 5787 deletions
--- a/server-rs/src/data/crime_by_year.rs
+++ b/server-rs/src/data/crime_by_year.rs
@ -0,0 +1,187 @@
+//! Per-LSOA per-crime-type per-year crime counts, loaded from a side parquet
+//! and used by the right pane to plot crime-over-time. Filtering is not
+//! supported — this data is display-only.
+
+use std::path::Path;
+
+use anyhow::{bail, Context};
+use polars::lazy::frame::LazyFrame;
+use polars::prelude::PlRefPath;
+use polars::prelude::*;
+use rustc_hash::FxHashMap;
+use tracing::info;
+
+use super::run_polars_io;
+
+/// Suffix that marks a per-year crime-type column in the parquet
+/// (e.g. `"Burglary (by year)"`). The loader treats every column whose name
+/// ends with this suffix as a crime type and strips the suffix to derive the
+/// display name.
+pub const BY_YEAR_SUFFIX: &str = " (by year)";
+
+/// A single sample of a crime time series: the count recorded for one year.
+#[derive(Clone, Copy)]
+pub struct YearPoint {
+    /// Year of the sample, taken from the `year` struct field of the parquet.
+    pub year: i32,
+    /// Count for that year. The loader converts integer and floating-point
+    /// source values to `f32`.
+    pub count: f32,
+}
+
+/// One per crime type: ordered list of (year, count) for a single LSOA.
+pub struct LsoaCrimeSeries {
+    /// Index into `crime_types` (and the parallel `years_by_type`) of
+    /// `CrimeByYearData`.
+    pub type_idx: u16,
+    /// Samples for this crime type; the loader sorts them by ascending year
+    /// and never stores an empty list.
+    pub points: Vec<YearPoint>,
+}
+
+/// In-memory crime-by-year tables: the known crime types, the years observed
+/// for each of them, and the per-LSOA time series.
+pub struct CrimeByYearData {
+    /// All crime type names in stable insertion order (the order in which the
+    /// by-year columns appear in the parquet).
+    pub crime_types: Vec<String>,
+    /// All years available for each crime type, same order as `crime_types`.
+    /// Each inner list is de-duplicated and sorted ascending.
+    pub years_by_type: Vec<Vec<i32>>,
+    /// LSOA code → all available per-type series for that LSOA. A crime type
+    /// with no usable points for an LSOA has no entry in that LSOA's list.
+    pub series_by_lsoa: FxHashMap<String, Vec<LsoaCrimeSeries>>,
+}
+
+impl CrimeByYearData {
+    /// Builds a dataset with no crime types, no years and no LSOA series.
+    pub fn empty() -> Self {
+        let crime_types = Vec::new();
+        let years_by_type = Vec::new();
+        let series_by_lsoa = FxHashMap::default();
+        Self {
+            crime_types,
+            years_by_type,
+            series_by_lsoa,
+        }
+    }
+
+    /// Loads the crime-by-year dataset from the parquet file at `path`.
+    ///
+    /// The actual work happens in `load_inner`; it is invoked through
+    /// `run_polars_io`, a helper from the parent module whose behaviour is not
+    /// visible here (presumably it isolates Polars I/O — TODO confirm).
+    ///
+    /// # Errors
+    /// Propagates whatever error `load_inner` (or the wrapper) reports.
+    pub fn load(path: &Path) -> anyhow::Result<Self> {
+        run_polars_io(|| Self::load_inner(path))
+    }
+
+    /// Reads the parquet at `path` and builds the in-memory crime-by-year tables.
+    ///
+    /// Expected layout, as enforced below:
+    /// * an `LSOA code` string column with no null or blank cells;
+    /// * one or more list-of-struct columns whose names end in
+    ///   [`BY_YEAR_SUFFIX`], each struct carrying a `year` and a `count` field.
+    ///
+    /// Null or empty lists are skipped, as are individual entries whose `year`
+    /// or `count` is null or of an unsupported dtype. Every stored series is
+    /// sorted by ascending year.
+    ///
+    /// # Errors
+    /// Fails when the file cannot be scanned or read, a required column or
+    /// struct field is missing or mistyped, an LSOA code is null or blank, no
+    /// by-year column exists, a 64-bit year does not fit in `i32`, or there are
+    /// more crime-type columns than a `u16` index can address.
+    fn load_inner(path: &Path) -> anyhow::Result<Self> {
+        info!("Loading crime-by-year from {}", path.display());
+        let pl_path = PlRefPath::try_from_path(path).with_context(|| {
+            format!(
+                "Failed to normalize crime-by-year parquet path {}",
+                path.display()
+            )
+        })?;
+        let df = LazyFrame::scan_parquet(pl_path, Default::default())
+            .with_context(|| format!("Failed to scan crime-by-year parquet at {}", path.display()))?
+            .collect()
+            .with_context(|| {
+                format!("Failed to read crime-by-year parquet at {}", path.display())
+            })?;
+
+        // One trimmed LSOA code per parquet row; null/blank codes are fatal
+        // because they are the lookup key for everything that follows.
+        let lsoa_col = df
+            .column("LSOA code")
+            .context("crime-by-year parquet missing 'LSOA code' column")?
+            .str()
+            .context("'LSOA code' column is not a string")?;
+        let lsoa_values: Vec<String> = lsoa_col
+            .into_iter()
+            .enumerate()
+            .map(|(row, value)| {
+                let value =
+                    value.with_context(|| format!("crime-by-year row {row} has null LSOA code"))?;
+                let trimmed = value.trim();
+                if trimmed.is_empty() {
+                    bail!("crime-by-year row {row} has blank LSOA code");
+                }
+                Ok(trimmed.to_string())
+            })
+            .collect::<anyhow::Result<Vec<_>>>()?;
+
+        // Discover crime-type columns (anything with the by-year suffix).
+        // Each entry is (display name, full column name).
+        let crime_type_cols: Vec<(String, String)> = df
+            .get_column_names()
+            .iter()
+            .filter_map(|name| {
+                let name = name.as_str();
+                name.strip_suffix(BY_YEAR_SUFFIX)
+                    .map(|stripped| (stripped.to_string(), name.to_string()))
+            })
+            .collect();
+
+        if crime_type_cols.is_empty() {
+            bail!(
+                "crime-by-year parquet at {} has no '* (by year)' columns",
+                path.display()
+            );
+        }
+
+        let crime_types: Vec<String> = crime_type_cols.iter().map(|(t, _)| t.clone()).collect();
+
+        let mut series_by_lsoa: FxHashMap<String, Vec<LsoaCrimeSeries>> = FxHashMap::default();
+        let mut years_by_type: Vec<Vec<i32>> = Vec::with_capacity(crime_type_cols.len());
+        let row_count = df.height();
+
+        for (type_idx, (_, col_name)) in crime_type_cols.iter().enumerate() {
+            // `LsoaCrimeSeries::type_idx` is a u16; fail loudly rather than
+            // letting an `as` cast wrap the index onto another crime type.
+            let type_idx = u16::try_from(type_idx).with_context(|| {
+                format!("Too many crime-by-year columns: index {type_idx} does not fit in u16")
+            })?;
+            // BTreeSet keeps the per-type year list unique and sorted.
+            let mut years_for_type = std::collections::BTreeSet::new();
+            let col = df
+                .column(col_name)
+                .with_context(|| format!("Missing crime-by-year column '{col_name}'"))?;
+            let list_ca = col
+                .list()
+                .with_context(|| format!("Column '{col_name}' is not a list"))?;
+
+            for row in 0..row_count {
+                let Some(inner) = list_ca.get_as_series(row) else {
+                    continue;
+                };
+                if inner.is_empty() {
+                    continue;
+                }
+                let structs = inner
+                    .struct_()
+                    .with_context(|| format!("Inner of '{col_name}' is not a struct"))?;
+                let years = structs
+                    .field_by_name("year")
+                    .with_context(|| format!("Missing 'year' field in '{col_name}'"))?;
+                let counts = structs
+                    .field_by_name("count")
+                    .with_context(|| format!("Missing 'count' field in '{col_name}'"))?;
+
+                let mut points: Vec<YearPoint> = Vec::with_capacity(inner.len());
+                for idx in 0..inner.len() {
+                    // Null or unsupported values skip just this entry.
+                    let yr = match years.get(idx).ok() {
+                        Some(AnyValue::Int32(y)) => y,
+                        // A checked conversion: an out-of-range year is corrupt
+                        // data and must not be wrapped into a plausible value.
+                        Some(AnyValue::Int64(y)) => i32::try_from(y).with_context(|| {
+                            format!("Year {y} in '{col_name}' (row {row}) does not fit in i32")
+                        })?,
+                        _ => continue,
+                    };
+                    let cnt = match counts.get(idx).ok() {
+                        Some(AnyValue::Float32(c)) => c,
+                        Some(AnyValue::Float64(c)) => c as f32,
+                        Some(AnyValue::Int32(c)) => c as f32,
+                        Some(AnyValue::Int64(c)) => c as f32,
+                        _ => continue,
+                    };
+                    points.push(YearPoint {
+                        year: yr,
+                        count: cnt,
+                    });
+                    years_for_type.insert(yr);
+                }
+                if points.is_empty() {
+                    continue;
+                }
+                points.sort_by_key(|p| p.year);
+
+                series_by_lsoa
+                    .entry(lsoa_values[row].clone())
+                    .or_default()
+                    .push(LsoaCrimeSeries { type_idx, points });
+            }
+            years_by_type.push(years_for_type.into_iter().collect());
+        }
+
+        info!(
+            lsoas = series_by_lsoa.len(),
+            crime_types = crime_types.len(),
+            "Crime-by-year data loaded"
+        );
+
+        Ok(Self {
+            crime_types,
+            years_by_type,
+            series_by_lsoa,
+        })
+    }
+}
--- a/server-rs/src/data/poi.rs
+++ b/server-rs/src/data/poi.rs
@ -131,6 +131,51 @@ pub fn resolve_poi_category_filter(category_values: &[String], categories: &str)
    selected
 }

+/// Metadata for state-funded school POIs (sourced from the DfE GIAS register).
+/// Every field is optional because GIAS does not populate every column for every
+/// establishment type (e.g. nurseries have no sixth form, FE colleges no FSM).
+///
+/// Each field carries `skip_serializing_if = "Option::is_none"`, so absent
+/// values are left out of the serialized output rather than emitted as nulls.
+#[derive(Serialize, Clone, Default)]
+pub struct SchoolMetadata {
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub phase: Option<String>,
+    // Raw identifier because `type` is a Rust keyword.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub r#type: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub type_group: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub age_range: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub gender: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub religious_character: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub admissions_policy: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub nursery_provision: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub sixth_form: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub capacity: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub pupils: Option<u32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub fsm_percent: Option<f32>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub trust: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub address: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub postcode: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub local_authority: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub website: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub telephone: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub head_name: Option<String>,
+}
+
 pub struct POIData {
    /// Contiguous buffer holding all POI ID strings end-to-end.
    id_buffer: String,
@ -149,6 +194,11 @@ pub struct POIData {
    /// uniform subset when the POI count exceeds the per-request limit.
    /// Computed once at load time so the same POIs are always chosen for a given viewport.
    pub priority: Vec<u32>,
+    /// Indirection table: row idx → index into `school_meta`, or u32::MAX when
+    /// the POI is not a school. Keeps the per-row overhead at 4 bytes regardless
+    /// of how many school metadata fields we carry.
+    school_meta_idx: Vec<u32>,
+    school_meta: Vec<SchoolMetadata>,
 }

 impl POIData {
@ -158,6 +208,16 @@ impl POIData {
        let length = self.id_lengths[row] as usize;
        &self.id_buffer[offset..offset + length]
    }
+
+    /// Looks up the school metadata attached to POI `row`.
+    ///
+    /// Yields `None` when the row's slot in the indirection table holds the
+    /// `u32::MAX` "not a school" marker. Panics if `row` is out of range.
+    pub fn school(&self, row: usize) -> Option<&SchoolMetadata> {
+        let slot = self.school_meta_idx[row];
+        (slot != u32::MAX).then(|| &self.school_meta[slot as usize])
+    }
 }

 fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
@ -195,6 +255,146 @@ fn extract_f32_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<f32>> {
        .collect()
 }

+/// Reads the string column `name` if the frame has one.
+///
+/// * `Ok(None)` — the column lookup failed (e.g. an older POI parquet without
+///   the school_* extension).
+/// * `Ok(Some(cells))` — one entry per row, with `None` standing for a null cell.
+/// * `Err(_)` — the column exists but is not a string column.
+fn extract_optional_str_col(
+    df: &DataFrame,
+    name: &str,
+) -> anyhow::Result<Option<Vec<Option<String>>>> {
+    let Ok(column) = df.column(name) else {
+        return Ok(None);
+    };
+    let strings = column
+        .str()
+        .with_context(|| format!("Column '{name}' is not a string column"))?;
+    let cells: Vec<Option<String>> = strings
+        .into_iter()
+        .map(|cell| cell.map(|text| text.to_string()))
+        .collect();
+    Ok(Some(cells))
+}
+
+/// Reads the column `name` as optional `u32` values, if the frame has it.
+///
+/// The column is first cast to `Int64`, then narrowed per cell.
+///
+/// * `Ok(None)` — the column lookup failed (column absent).
+/// * `Ok(Some(cells))` — one entry per row; a cell is `None` when it is null
+///   or when its value lies outside the `u32` range (negative or too large).
+/// * `Err(_)` — the column cannot be cast to, or read as, `Int64`.
+fn extract_optional_u32_col(
+    df: &DataFrame,
+    name: &str,
+) -> anyhow::Result<Option<Vec<Option<u32>>>> {
+    let column = match df.column(name) {
+        Ok(column) => column,
+        Err(_) => return Ok(None),
+    };
+    let cast = column
+        .cast(&DataType::Int64)
+        .with_context(|| format!("Failed to cast column '{name}' to Int64"))?;
+    let int_column = cast
+        .i64()
+        .with_context(|| format!("Column '{name}' is not an integer column"))?;
+    Ok(Some(
+        int_column
+            .into_iter()
+            // Checked narrowing: an `as u32` cast would wrap values above
+            // u32::MAX into unrelated numbers instead of rejecting them.
+            .map(|value| value.and_then(|v| u32::try_from(v).ok()))
+            .collect(),
+    ))
+}
+
+/// Reads the column `name` as optional `f32` values, if the frame has it.
+///
+/// * `Ok(None)` — the column lookup failed (column absent).
+/// * `Ok(Some(cells))` — one entry per row after casting to `Float32`, with
+///   `None` for null cells.
+/// * `Err(_)` — the column cannot be cast to, or read as, `Float32`.
+fn extract_optional_f32_col(
+    df: &DataFrame,
+    name: &str,
+) -> anyhow::Result<Option<Vec<Option<f32>>>> {
+    let Ok(column) = df.column(name) else {
+        return Ok(None);
+    };
+    let as_float = column
+        .cast(&DataType::Float32)
+        .with_context(|| format!("Failed to cast column '{name}' to Float32"))?;
+    let floats = as_float
+        .f32()
+        .with_context(|| format!("Column '{name}' is not a float32 column"))?;
+    let cells: Vec<Option<f32>> = floats.into_iter().collect();
+    Ok(Some(cells))
+}
+
+/// Builds the school-metadata side table for a POI frame with `row_count` rows.
+///
+/// Returns `(index, table)`: `index[row]` is the position of that row's
+/// [`SchoolMetadata`] in `table`, or `u32::MAX` when the row is not a school.
+///
+/// When the `school_phase` column is absent the frame is treated as predating
+/// the school extension: the index is all-sentinel and the table is empty. Any
+/// other absent `school_*` column simply yields `None` for that field.
+///
+/// # Errors
+/// Fails when a present `school_*` column has an unexpected type, or when the
+/// number of schools cannot be addressed by a `u32` below the sentinel.
+fn build_school_meta(
+    row_count: usize,
+    df: &DataFrame,
+) -> anyhow::Result<(Vec<u32>, Vec<SchoolMetadata>)> {
+    // Marker stored in the index for rows that are not schools.
+    const NOT_A_SCHOOL: u32 = u32::MAX;
+
+    // Clones the cell at `row`; a missing column (empty slice) reads as null.
+    fn cell<T: Clone>(col: &[Option<T>], row: usize) -> Option<T> {
+        col.get(row).cloned().flatten()
+    }
+
+    let Some(phase) = extract_optional_str_col(df, "school_phase")? else {
+        // POI parquet predates the school metadata extension — record an empty
+        // table and a sentinel-filled index, so callers transparently see None.
+        return Ok((vec![NOT_A_SCHOOL; row_count], Vec::new()));
+    };
+
+    let r#type = extract_optional_str_col(df, "school_type")?.unwrap_or_default();
+    let type_group = extract_optional_str_col(df, "school_type_group")?.unwrap_or_default();
+    let age_range = extract_optional_str_col(df, "school_age_range")?.unwrap_or_default();
+    let gender = extract_optional_str_col(df, "school_gender")?.unwrap_or_default();
+    let religious_character =
+        extract_optional_str_col(df, "school_religious_character")?.unwrap_or_default();
+    let admissions_policy =
+        extract_optional_str_col(df, "school_admissions_policy")?.unwrap_or_default();
+    let nursery_provision =
+        extract_optional_str_col(df, "school_nursery_provision")?.unwrap_or_default();
+    let sixth_form = extract_optional_str_col(df, "school_sixth_form")?.unwrap_or_default();
+    let capacity = extract_optional_u32_col(df, "school_capacity")?.unwrap_or_default();
+    let pupils = extract_optional_u32_col(df, "school_pupils")?.unwrap_or_default();
+    let fsm_percent = extract_optional_f32_col(df, "school_fsm_percent")?.unwrap_or_default();
+    let trust = extract_optional_str_col(df, "school_trust")?.unwrap_or_default();
+    let address = extract_optional_str_col(df, "school_address")?.unwrap_or_default();
+    let postcode = extract_optional_str_col(df, "school_postcode")?.unwrap_or_default();
+    let local_authority =
+        extract_optional_str_col(df, "school_local_authority")?.unwrap_or_default();
+    let website = extract_optional_str_col(df, "school_website")?.unwrap_or_default();
+    let telephone = extract_optional_str_col(df, "school_telephone")?.unwrap_or_default();
+    let head_name = extract_optional_str_col(df, "school_head_name")?.unwrap_or_default();
+
+    let mut idx = vec![NOT_A_SCHOOL; row_count];
+    let mut meta = Vec::new();
+    for (row, slot) in idx.iter_mut().enumerate() {
+        let type_group_val = cell(&type_group, row);
+        let type_val = cell(&r#type, row);
+        // A row counts as a school when either school_type_group or
+        // school_type is non-null. (The pipeline is said to populate
+        // type_group for every GIAS row; school_type acts as a fallback.)
+        if type_group_val.is_none() && type_val.is_none() {
+            continue;
+        }
+        // The position must fit in u32 and must not collide with the sentinel,
+        // otherwise a real school would read back as "not a school".
+        *slot = u32::try_from(meta.len())
+            .ok()
+            .filter(|&position| position != NOT_A_SCHOOL)
+            .with_context(|| format!("Too many school rows to index with u32 (row {row})"))?;
+        meta.push(SchoolMetadata {
+            phase: cell(&phase, row),
+            r#type: type_val,
+            type_group: type_group_val,
+            age_range: cell(&age_range, row),
+            gender: cell(&gender, row),
+            religious_character: cell(&religious_character, row),
+            admissions_policy: cell(&admissions_policy, row),
+            nursery_provision: cell(&nursery_provision, row),
+            sixth_form: cell(&sixth_form, row),
+            capacity: cell(&capacity, row),
+            pupils: cell(&pupils, row),
+            fsm_percent: cell(&fsm_percent, row),
+            trust: cell(&trust, row),
+            address: cell(&address, row),
+            postcode: cell(&postcode, row),
+            local_authority: cell(&local_authority, row),
+            website: cell(&website, row),
+            telephone: cell(&telephone, row),
+            head_name: cell(&head_name, row),
+        });
+    }
+    Ok((idx, meta))
+}
+
 impl POIData {
    pub fn load(parquet_path: &Path) -> anyhow::Result<Self> {
        super::run_polars_io(|| Self::load_inner(parquet_path))
@ -259,6 +459,9 @@ impl POIData {
        // preventing visual "shuffling" when panning the map.
        let priority = generate_priorities(row_count);

+        let (school_meta_idx, school_meta) = build_school_meta(row_count, &df)?;
+        info!(schools = school_meta.len(), "Loaded GIAS school metadata");
+
        info!("POI data loading complete.");

        Ok(POIData {
@ -273,6 +476,8 @@ impl POIData {
            lng,
            emoji,
            priority,
+            school_meta_idx,
+            school_meta,
        })
    }

--- a/server-rs/src/data/property.rs
+++ b/server-rs/src/data/property.rs
@ -569,6 +569,13 @@ pub struct RenovationEvent {
    pub event: String,
 }

+/// One historical sale transaction for a property, as read from the
+/// `historical_prices` list column (struct fields `year`, `month`, `price`).
+#[derive(Serialize, Clone)]
+pub struct HistoricalPrice {
+    /// Year of the sale.
+    pub year: i32,
+    /// Month of the sale (presumably 1–12; the loader does not validate the range).
+    pub month: u8,
+    /// Sale price; the unit is not visible here (presumably whole pounds — TODO confirm).
+    pub price: i64,
+}
+
 /// Lightweight reference to quantization parameters for decoding u16 feature data.
 pub struct QuantRef<'a> {
    pub dequant_a: &'a [f32],
@ -824,6 +831,10 @@ pub struct PropertyData {
    /// Interned postcodes: reader is thread-safe, keys index into it.
    postcode_interner: lasso::RodeoReader,
    postcode_keys: Vec<lasso::Spur>,
+    /// Interned LSOA (2021) codes per row.
+    /// Used to look up per-LSOA side tables (e.g. crime time series).
+    lsoa_interner: lasso::RodeoReader,
+    lsoa_keys: Vec<lasso::Spur>,
    /// Rows for each postcode, keyed by the interned postcode key.
    postcode_row_index: FxHashMap<lasso::Spur, Vec<u32>>,
    /// Inverted index from address tokens to property rows.
@ -850,6 +861,9 @@ pub struct PropertyData {
    /// Per-row renovation events. Keyed by (permuted) row index.
    /// Only rows with events are present in the map.
    renovation_history: FxHashMap<u32, Vec<RenovationEvent>>,
+    /// Per-row historical sale transactions (Land Registry price-paid).
+    /// Keyed by (permuted) row index. Only rows with prices are present.
+    historical_prices: FxHashMap<u32, Vec<HistoricalPrice>>,
    property_sub_type: FxHashMap<u32, String>,
    price_qualifier: FxHashMap<u32, String>,
 }
@ -867,6 +881,11 @@ impl PropertyData {
        self.postcode_interner.resolve(&self.postcode_keys[row])
    }

+    /// Resolves the interned LSOA (2021) code stored for `row`.
+    ///
+    /// Panics if `row` is outside `lsoa_keys`.
+    pub fn lsoa(&self, row: usize) -> &str {
+        let key = &self.lsoa_keys[row];
+        self.lsoa_interner.resolve(key)
+    }
+
    /// Get postcode components for field-level borrowing (avoids conflicting borrows with feature_data).
    pub fn postcode_parts(&self) -> (&lasso::RodeoReader, &[lasso::Spur]) {
        (&self.postcode_interner, &self.postcode_keys)
@ -1044,6 +1063,14 @@ impl PropertyData {
            .unwrap_or(&[])
    }

+    /// Returns the historical sale transactions recorded for `row`; rows
+    /// without any recorded price yield an empty slice.
+    pub fn historical_prices(&self, row: usize) -> &[HistoricalPrice] {
+        match self.historical_prices.get(&(row as u32)) {
+            Some(prices) => prices.as_slice(),
+            None => &[],
+        }
+    }
+
    /// Get property sub-type for a given row.
    pub fn property_sub_type(&self, row: usize) -> Option<&str> {
        self.property_sub_type
@ -1505,6 +1532,15 @@ impl PropertyData {
            }
        }

+        // LSOA (2021) code per row, brought in via the postcode join. Used as a
+        // lookup key into per-LSOA side tables (e.g. crime time series).
+        match schema.get("lsoa21") {
+            Some(dtype) if matches!(dtype, DataType::String) || dtype.is_categorical() => {}
+            Some(dtype) => bail!("Column 'lsoa21' has unexpected type {:?}", dtype),
+            None => bail!("Required column 'lsoa21' not found in joined property data"),
+        }
+        select_exprs.push(col("lsoa21").cast(DataType::String));
+
        // Enum features as String
        for &name in &enum_names {
            select_exprs.push(col(name).cast(DataType::String));
@ -1519,6 +1555,10 @@ impl PropertyData {
        if has_renovation_history {
            select_exprs.push(col("renovation_history"));
        }
+        let has_historical_prices = schema.get("historical_prices").is_some();
+        if has_historical_prices {
+            select_exprs.push(col("historical_prices"));
+        }
        let df = combined_lf
            .filter(col("lat").is_not_null().and(col("lon").is_not_null()))
            .select(select_exprs)
@ -1655,9 +1695,33 @@ impl PropertyData {
                    Ok(vec![None; row_count])
                }
            };
+        let extract_required_trimmed_string_col =
+            |df: &DataFrame, name: &str| -> anyhow::Result<Vec<String>> {
+                let column = df
+                    .column(name)
+                    .with_context(|| format!("Required column '{name}' not found in parquet"))?;
+                let string_column = column
+                    .str()
+                    .with_context(|| format!("Column '{name}' is not a string column"))?;
+                string_column
+                    .into_iter()
+                    .enumerate()
+                    .map(|(row, value)| {
+                        let value = value.with_context(|| {
+                            format!("Required column '{name}' has null at row {row}")
+                        })?;
+                        let trimmed = value.trim();
+                        if trimmed.is_empty() {
+                            bail!("Required column '{name}' has blank value at row {row}");
+                        }
+                        Ok(trimmed.to_string())
+                    })
+                    .collect()
+            };

        let property_sub_type_raw = extract_optional_string_col(&df, "Property sub-type")?;
        let price_qualifier_raw = extract_optional_string_col(&df, "Price qualifier")?;
+        let lsoa_raw = extract_required_trimmed_string_col(&df, "lsoa21")?;

        tracing::info!("Building enum features");
        // enum_col_major: Vec<(values_list, encoded_as_f32)>
@ -1801,6 +1865,70 @@ impl PropertyData {
            FxHashMap::default()
        };

+        // Extract historical_prices: List<Struct{year: i32, month: u8, price: i64}>
+        let mut historical_prices_raw: FxHashMap<u32, Vec<HistoricalPrice>> =
+            if has_historical_prices {
+                tracing::info!("Extracting historical prices");
+                let prices_col = df
+                    .column("historical_prices")
+                    .context("Missing historical_prices column")?;
+                let list_ca = prices_col
+                    .list()
+                    .context("historical_prices is not a list column")?;
+
+                let mut history: FxHashMap<u32, Vec<HistoricalPrice>> = FxHashMap::default();
+                for old_row in 0..row_count {
+                    if let Some(inner) = list_ca.get_as_series(old_row) {
+                        if inner.is_empty() {
+                            continue;
+                        }
+                        let structs = inner
+                            .struct_()
+                            .context("historical_prices inner is not a struct")?;
+                        let years = structs
+                            .field_by_name("year")
+                            .context("Missing 'year' field in historical_prices struct")?;
+                        let months = structs
+                            .field_by_name("month")
+                            .context("Missing 'month' field in historical_prices struct")?;
+                        let prices = structs
+                            .field_by_name("price")
+                            .context("Missing 'price' field in historical_prices struct")?;
+
+                        let mut row_prices = Vec::new();
+                        for idx in 0..inner.len() {
+                            let year = years.get(idx).context("Failed to get year value")?;
+                            let month = months.get(idx).context("Failed to get month value")?;
+                            let price = prices.get(idx).context("Failed to get price value")?;
+                            let AnyValue::Int32(year_i32) = year else {
+                                bail!("historical_prices.year is not Int32 at row {old_row}, got {year:?}");
+                            };
+                            let AnyValue::UInt8(month_u8) = month else {
+                                bail!("historical_prices.month is not UInt8 at row {old_row}, got {month:?}");
+                            };
+                            let AnyValue::Int64(price_i64) = price else {
+                                bail!("historical_prices.price is not Int64 at row {old_row}, got {price:?}");
+                            };
+                            row_prices.push(HistoricalPrice {
+                                year: year_i32,
+                                month: month_u8,
+                                price: price_i64,
+                            });
+                        }
+                        if !row_prices.is_empty() {
+                            history.insert(old_row as u32, row_prices);
+                        }
+                    }
+                }
+                tracing::info!(
+                    properties_with_prices = history.len(),
+                    "Historical prices extracted"
+                );
+                history
+            } else {
+                FxHashMap::default()
+            };
+
        // Free the projected joined frame before building the row-major matrix.
        drop(df);

@ -1904,6 +2032,14 @@ impl PropertyData {
        }
        let postcode_interner = postcode_rodeo.into_reader();

+        // Intern LSOA codes (permuted).
+        let mut lsoa_rodeo = lasso::Rodeo::default();
+        let mut lsoa_keys: Vec<lasso::Spur> = Vec::with_capacity(row_count);
+        for &perm_index in perm.iter() {
+            lsoa_keys.push(lsoa_rodeo.get_or_intern(&lsoa_raw[perm_index as usize]));
+        }
+        let lsoa_interner = lsoa_rodeo.into_reader();
+
        let row_to_poi_metric_idx: Vec<u32> = if poi_metrics.is_empty() {
            vec![NO_POI_METRIC_ROW; row_count]
        } else {
@ -1939,6 +2075,20 @@ impl PropertyData {
            map
        };

+        // Re-key historical_prices by permuted row index
+        let historical_prices: FxHashMap<u32, Vec<HistoricalPrice>> = {
+            let mut map = FxHashMap::with_capacity_and_hasher(
+                historical_prices_raw.len(),
+                Default::default(),
+            );
+            for (new_row, &old_row) in perm.iter().enumerate() {
+                if let Some(prices) = historical_prices_raw.remove(&old_row) {
+                    map.insert(new_row as u32, prices);
+                }
+            }
+            map
+        };
+
        // Permute optional string columns into sparse HashMaps
        let property_sub_type: FxHashMap<u32, String> = {
            let mut map = FxHashMap::default();
@ -2061,6 +2211,8 @@ impl PropertyData {
            address_lengths,
            postcode_interner,
            postcode_keys,
+            lsoa_interner,
+            lsoa_keys,
            postcode_row_index,
            address_token_index,
            address_prefix_index,
@ -2072,6 +2224,7 @@ impl PropertyData {
            enum_counts,
            approx_build_date_bits,
            renovation_history,
+            historical_prices,
            property_sub_type,
            price_qualifier,
        })