Rust things

This commit is contained in:
Andras Schmelczer 2026-05-10 14:55:43 +01:00
parent fc10381692
commit 3debacab4f
30 changed files with 3257 additions and 647 deletions

View file

@ -14,6 +14,7 @@ const ADDRESS_SEARCH_CANDIDATE_LIMIT: usize = 50_000;
const ADDRESS_SEARCH_MAX_POSTINGS_PER_TOKEN: usize = 250_000;
const ADDRESS_SEARCH_PREFIX_MIN_LEN: usize = 4;
const ADDRESS_SEARCH_PREFIX_MAX_LEN: usize = 8;
/// Sentinel stored in the per-property POI row mapping for property rows that
/// have no row in the postcode POI metric table.
const NO_POI_METRIC_ROW: u32 = u32::MAX;
fn is_numeric_dtype(dtype: &DataType) -> bool {
matches!(
@ -495,6 +496,187 @@ impl QuantRef<'_> {
}
}
/// Postcode-level side table of quantized POI metrics, together with a
/// per-property lookup that says which table row (if any) a property uses.
pub struct PostcodePoiMetrics {
/// Metric names; a metric's position in this list is its `metric_idx`.
pub feature_names: Vec<String>,
/// Reverse lookup from metric name to its index in `feature_names`.
pub name_to_index: FxHashMap<String, usize>,
/// Metric-major storage: columns[metric_idx][postcode_metric_idx].
/// Values are quantized to `u16`; non-finite inputs are stored as `NAN_U16`.
pub columns: Vec<Vec<u16>>,
/// Per-metric statistics produced by `compute_feature_stats`, indexed by
/// `metric_idx`.
pub feature_stats: Vec<FeatureStats>,
/// Per-property row lookup into the postcode metric table.
/// `NO_POI_METRIC_ROW` marks a property row without a metric row.
row_to_metric_idx: Vec<u32>,
/// Per-metric dequantization step: `quant_range / QUANT_SCALE`, or 0.0 when
/// the range is not positive.
dequant_a: Vec<f32>,
/// Per-metric lower end of the quantization interval.
quant_min: Vec<f32>,
/// Per-metric width of the quantization interval (`max - min`, or 0.0 when
/// `max` is not greater than `min`).
quant_range: Vec<f32>,
}
impl PostcodePoiMetrics {
    /// Builds a table with no metrics whose row mapping marks each of the
    /// `row_count` property rows as having no metric row.
    fn empty(row_count: usize) -> Self {
        Self {
            feature_names: Vec::new(),
            name_to_index: FxHashMap::default(),
            columns: Vec::new(),
            feature_stats: Vec::new(),
            row_to_metric_idx: vec![NO_POI_METRIC_ROW; row_count],
            dequant_a: Vec::new(),
            quant_min: Vec::new(),
            quant_range: Vec::new(),
        }
    }

    /// Quantizes the columns of `df` named in `feature_names` into a
    /// metric-major `u16` side table.
    ///
    /// The returned table has an empty property-row mapping; install one with
    /// `set_row_mapping`. When `feature_names` is empty the result is
    /// `Self::empty(0)`.
    ///
    /// # Errors
    ///
    /// Fails if a named column is missing from `df`, if `column_to_f32_vec`
    /// rejects it, or if a metric has no bounds configuration.
    fn from_postcode_df(df: &DataFrame, feature_names: Vec<String>) -> anyhow::Result<Self> {
        if feature_names.is_empty() {
            return Ok(Self::empty(0));
        }
        tracing::info!(
            metrics = feature_names.len(),
            postcodes = df.height(),
            "Building postcode POI metric side table"
        );

        let col_major: Vec<Vec<f32>> = feature_names
            .par_iter()
            .map(|name| {
                let column = df
                    .column(name.as_str())
                    .with_context(|| format!("Missing POI metric column '{name}'"))?;
                column_to_f32_vec(column)
            })
            .collect::<anyhow::Result<Vec<_>>>()?;

        // Resolve each metric's bounds exactly once and derive both its stats
        // and its quantization interval from that single lookup. A missing
        // bounds config is always an error; there is no second lookup that
        // could silently fall back to the histogram range.
        let per_metric = col_major
            .par_iter()
            .enumerate()
            .map(|(metric_idx, vals)| {
                let name = feature_names[metric_idx].as_str();
                let bounds = features::bounds_for(name)
                    .with_context(|| format!("No bounds config for POI metric '{name}'"))?;
                let stats = compute_feature_stats(vals, &bounds, features::has_integer_bins(name));
                // Fixed bounds pin the quantization interval; any other kind
                // of bounds uses the observed histogram range.
                let (min, max) = match bounds {
                    Bounds::Fixed { min, max } => (min, max),
                    _ => (stats.histogram.min, stats.histogram.max),
                };
                Ok((stats, min, max))
            })
            .collect::<anyhow::Result<Vec<_>>>()?;

        let mut feature_stats: Vec<FeatureStats> = Vec::with_capacity(per_metric.len());
        let mut quant_min = Vec::with_capacity(per_metric.len());
        let mut quant_range = Vec::with_capacity(per_metric.len());
        for (stats, min, max) in per_metric {
            feature_stats.push(stats);
            quant_min.push(min);
            // A non-positive (or NaN) interval collapses to a zero range.
            quant_range.push(if max > min { max - min } else { 0.0 });
        }

        let dequant_a: Vec<f32> = quant_range
            .iter()
            .map(|&range| {
                if range > 0.0 {
                    range / QUANT_SCALE
                } else {
                    0.0
                }
            })
            .collect();

        let columns: Vec<Vec<u16>> = col_major
            .par_iter()
            .enumerate()
            .map(|(metric_idx, vals)| {
                let range = quant_range[metric_idx];
                let min = quant_min[metric_idx];
                vals.iter()
                    .map(|&value| {
                        if !value.is_finite() {
                            NAN_U16
                        } else if range > 0.0 {
                            let normalized = (value - min) / range;
                            (normalized * QUANT_SCALE).round().clamp(0.0, QUANT_SCALE) as u16
                        } else {
                            // Zero-width interval: every finite value encodes
                            // as 0 and therefore decodes to `quant_min`.
                            0
                        }
                    })
                    .collect()
            })
            .collect();

        let name_to_index = feature_names
            .iter()
            .enumerate()
            .map(|(idx, name)| (name.clone(), idx))
            .collect();

        Ok(Self {
            feature_names,
            name_to_index,
            columns,
            feature_stats,
            row_to_metric_idx: Vec::new(),
            dequant_a,
            quant_min,
            quant_range,
        })
    }

    /// Replaces the property-row → metric-row mapping. Entries equal to
    /// `NO_POI_METRIC_ROW` mean "no metric row for this property".
    fn set_row_mapping(&mut self, row_to_metric_idx: Vec<u32>) {
        self.row_to_metric_idx = row_to_metric_idx;
    }

    /// Returns `true` when the table holds no metrics.
    pub fn is_empty(&self) -> bool {
        self.feature_names.is_empty()
    }

    /// Number of metrics in the table.
    pub fn num_features(&self) -> usize {
        self.feature_names.len()
    }

    /// Borrows the quantization tables as a `QuantRef`, with `num_numeric`
    /// set to the number of metrics.
    pub fn quant_ref(&self) -> QuantRef<'_> {
        QuantRef {
            dequant_a: &self.dequant_a,
            quant_min: &self.quant_min,
            quant_range: &self.quant_range,
            num_numeric: self.feature_names.len(),
        }
    }

    /// Metric-table row for property `row`, or `None` when `row` is outside
    /// the mapping or is mapped to `NO_POI_METRIC_ROW`.
    #[inline]
    pub fn metric_row_for_property(&self, row: usize) -> Option<usize> {
        self.row_to_metric_idx
            .get(row)
            .copied()
            .filter(|&idx| idx != NO_POI_METRIC_ROW)
            .map(|idx| idx as usize)
    }

    /// Quantized value of metric `metric_idx` at metric-table row
    /// `metric_row`.
    ///
    /// # Panics
    ///
    /// Panics if either index is out of range.
    #[inline]
    pub fn raw_for_metric_row(&self, metric_row: usize, metric_idx: usize) -> u16 {
        self.columns[metric_idx][metric_row]
    }

    /// Quantized value of metric `metric_idx` for property `row`; `NAN_U16`
    /// when the property has no metric row.
    #[inline]
    pub fn raw_for_property_row(&self, row: usize, metric_idx: usize) -> u16 {
        let Some(metric_row) = self.metric_row_for_property(row) else {
            return NAN_U16;
        };
        self.raw_for_metric_row(metric_row, metric_idx)
    }

    /// Converts a quantized value back to `f32`:
    /// `raw * dequant_a[metric_idx] + quant_min[metric_idx]`, with `NAN_U16`
    /// decoding to NaN.
    ///
    /// # Panics
    ///
    /// Panics if `metric_idx` is out of range and `raw` is not `NAN_U16`.
    #[inline]
    pub fn decode_raw(&self, metric_idx: usize, raw: u16) -> f32 {
        if raw == NAN_U16 {
            f32::NAN
        } else {
            raw as f32 * self.dequant_a[metric_idx] + self.quant_min[metric_idx]
        }
    }

    /// Decoded value of metric `metric_idx` for property `row`; NaN when the
    /// property has no metric row or the stored value is `NAN_U16`.
    #[inline]
    pub fn get_for_property_row(&self, row: usize, metric_idx: usize) -> f32 {
        self.decode_raw(metric_idx, self.raw_for_property_row(row, metric_idx))
    }
}
pub struct PropertyData {
pub lat: Vec<f32>,
pub lon: Vec<f32>,
@ -514,6 +696,7 @@ pub struct PropertyData {
/// Per-feature: max - min (for encoding filter bounds).
quant_range: Vec<f32>,
pub feature_stats: Vec<FeatureStats>,
pub poi_metrics: PostcodePoiMetrics,
/// Unquantized last sale price used by the price-history chart.
last_known_price_raw: Vec<f32>,
/// Contiguous buffer holding all address strings end-to-end.
@ -1055,19 +1238,54 @@ pub fn precompute_h3(lat: &[f32], lon: &[f32]) -> anyhow::Result<Vec<u64>> {
impl PropertyData {
/// Loads property data from the properties and postcode-feature files at the
/// given paths by running `load_inner` through `super::run_polars_io`.
///
/// Returns whatever error `load_inner` (or the wrapper) reports.
pub fn load(properties_path: &Path, postcode_features_path: &Path) -> anyhow::Result<Self> {
super::run_polars_io(|| Self::load_inner(properties_path, postcode_features_path))
}
fn load_inner(properties_path: &Path, postcode_features_path: &Path) -> anyhow::Result<Self> {
// Load postcode.parquet
tracing::info!(
"Loading postcode features from {:?}",
postcode_features_path
);
let postcode_features_path = PlRefPath::try_from_path(postcode_features_path)
.context("Failed to normalize postcode parquet path")?;
let postcode_df = LazyFrame::scan_parquet(postcode_features_path, Default::default())
.context("Failed to scan postcode parquet")?
.collect()
.context("Failed to read postcode parquet")?;
tracing::info!(rows = postcode_df.height(), "Postcode features loaded");
let mut poi_metric_names: Vec<String> = postcode_df
.get_column_names()
.iter()
.map(|name| name.as_str())
.filter(|&name| features::is_dynamic_poi_feature(name))
.map(str::to_string)
.collect();
poi_metric_names.sort_by_key(|name| features::dynamic_poi_feature_sort_key(name));
let poi_metric_by_postcode: FxHashMap<String, u32> = if poi_metric_names.is_empty() {
FxHashMap::default()
} else {
let postcode_column = postcode_df
.column("Postcode")
.context("Postcode feature parquet missing 'Postcode' column")?
.str()
.context("'Postcode' column in postcode feature parquet is not a string")?;
postcode_column
.into_iter()
.enumerate()
.filter_map(|(idx, postcode)| {
postcode.map(|postcode| (postcode.to_string(), idx as u32))
})
.collect()
};
let mut poi_metrics = PostcodePoiMetrics::from_postcode_df(&postcode_df, poi_metric_names)?;
// Load properties.parquet and join with postcode data for lat/lon + area features
tracing::info!("Loading properties from {:?}", properties_path);
let properties_path = PlRefPath::try_from_path(properties_path)
.context("Failed to normalize properties parquet path")?;
let properties_lf = LazyFrame::scan_parquet(properties_path, Default::default())
.context("Failed to scan properties parquet")?;
let combined = properties_lf
@ -1082,14 +1300,20 @@ impl PropertyData {
let total_rows = combined.height();
tracing::info!(rows = total_rows, "Properties joined with postcodes");
// Get configured feature/enum names in config order
let numeric_names = features::all_numeric_feature_names();
// Get configured feature/enum names in config order. Dynamic POI
// metrics live in a postcode-level side table so they do not widen the
// hot row-major property feature matrix.
let configured_numeric_names = features::all_numeric_feature_names();
let enum_names = features::all_enum_feature_names();
let schema = combined.schema();
let numeric_names: Vec<String> = configured_numeric_names
.iter()
.map(|name| (*name).to_string())
.collect();
for name in &numeric_names {
match schema.get(name) {
match schema.get(name.as_str()) {
Some(dtype) if is_numeric_dtype(dtype) => {}
Some(dtype) => bail!(
"Configured numeric feature '{}' has non-numeric type {:?}",
@ -1120,8 +1344,8 @@ impl PropertyData {
// Combine numeric and enum feature names (numeric first, then enum)
let feature_names: Vec<String> = numeric_names
.iter()
.chain(enum_names.iter())
.map(|name| name.to_string())
.chain(enum_names.iter().map(|name| name.to_string()))
.collect();
let num_features = feature_names.len();
let num_numeric = numeric_names.len();
@ -1138,16 +1362,16 @@ impl PropertyData {
select_exprs.push(col("lon").cast(DataType::Float32));
// Select numeric features as Float32 (datetime columns → fractional year)
for &name in &numeric_names {
if is_datetime_dtype(schema.get(name).unwrap()) {
for name in &numeric_names {
if is_datetime_dtype(schema.get(name.as_str()).unwrap()) {
select_exprs.push(
(col(name).dt().year().cast(DataType::Float32)
+ (col(name).dt().month().cast(DataType::Float32) - lit(1.0f32))
(col(name.as_str()).dt().year().cast(DataType::Float32)
+ (col(name.as_str()).dt().month().cast(DataType::Float32) - lit(1.0f32))
/ lit(12.0f32))
.alias(name),
.alias(name.as_str()),
);
} else {
select_exprs.push(col(name).cast(DataType::Float32));
select_exprs.push(col(name.as_str()).cast(DataType::Float32));
}
}
@ -1233,7 +1457,7 @@ impl PropertyData {
.par_iter()
.map(|name| {
let column = df
.column(name)
.column(name.as_str())
.with_context(|| format!("Missing feature column '{name}'"))?;
column_to_f32_vec(column)
})
@ -1244,10 +1468,10 @@ impl PropertyData {
.par_iter()
.enumerate()
.map(|(feat_index, vals)| {
let name = numeric_names[feat_index];
let name = numeric_names[feat_index].as_str();
let bounds = features::bounds_for(name)
.with_context(|| format!("No bounds config for feature '{}'", name))?;
let stats = compute_feature_stats(vals, bounds, features::has_integer_bins(name));
let stats = compute_feature_stats(vals, &bounds, features::has_integer_bins(name));
tracing::debug!(
feature = %name,
slider_min = format_args!("{:.2}", stats.slider_min),
@ -1268,8 +1492,8 @@ impl PropertyData {
let mut quant_min = Vec::with_capacity(num_features);
let mut quant_range = Vec::with_capacity(num_features);
for (feat_idx, stats) in numeric_feature_stats.iter().enumerate() {
let (min, max) = match features::bounds_for(numeric_names[feat_idx]) {
Some(Bounds::Fixed { min, max }) => (*min, *max),
let (min, max) = match features::bounds_for(numeric_names[feat_idx].as_str()) {
Some(Bounds::Fixed { min, max }) => (min, max),
_ => (stats.histogram.min, stats.histogram.max),
};
quant_min.push(min);
@ -1284,10 +1508,15 @@ impl PropertyData {
let string_column = column
.str()
.with_context(|| format!("Column '{name}' is not a string column"))?;
Ok(string_column
string_column
.into_iter()
.map(|value| value.unwrap_or("").to_string())
.collect())
.enumerate()
.map(|(row, value)| {
value
.map(ToString::to_string)
.with_context(|| format!("Required column '{name}' has null at row {row}"))
})
.collect()
};
let address_raw = extract_string_col(&df, "Address per Property Register")?;
@ -1325,18 +1554,18 @@ impl PropertyData {
// enum_col_major: Vec<(values_list, encoded_as_f32)>
let enum_col_major: Vec<(Vec<String>, Vec<f32>)> = enum_names
.par_iter()
.filter_map(|&name| {
let column_data = df.column(name).ok()?;
let string_column = column_data.str().ok()?;
.map(|&name| -> anyhow::Result<(Vec<String>, Vec<f32>)> {
let column_data = df
.column(name)
.with_context(|| format!("Required enum column '{name}' not found"))?;
let string_column = column_data
.str()
.with_context(|| format!("Enum column '{name}' is not a string column"))?;
let unique_set: std::collections::HashSet<String> = string_column
.into_iter()
.filter_map(|value| {
let text = value.unwrap_or("");
if text.is_empty() {
None
} else {
Some(text.to_string())
}
let text = value?.trim();
(!text.is_empty()).then(|| text.to_string())
})
.collect();
@ -1373,20 +1602,22 @@ impl PropertyData {
let encoded: Vec<f32> = string_column
.into_iter()
.map(|value| {
let text = value.unwrap_or("");
if text.is_empty() {
f32::NAN
} else {
*value_to_idx.get(text).unwrap_or(&f32::NAN)
}
.enumerate()
.map(|(row, value)| {
let Some(text) = value.map(str::trim).filter(|text| !text.is_empty())
else {
return Ok(f32::NAN);
};
value_to_idx.get(text).copied().with_context(|| {
format!("Enum column '{name}' has unknown value '{text}' at row {row}")
})
})
.collect();
.collect::<anyhow::Result<Vec<_>>>()?;
tracing::debug!(column = %name, unique_values = unique.len(), "Enum feature encoded as f32");
Some((unique, encoded))
Ok((unique, encoded))
})
.collect();
.collect::<anyhow::Result<Vec<_>>>()?;
// Extract is_approx_build_date: 0.0 = exact, anything else (1.0/NaN) = approximate
let is_approx_build_date_raw: Vec<bool> = if has_approx_col {
@ -1487,13 +1718,13 @@ impl PropertyData {
.collect();
let last_known_price_raw: Vec<f32> = numeric_names
.iter()
.position(|&name| name == "Last known price")
.position(|name| name == "Last known price")
.map(|price_idx| {
perm.iter()
.map(|&perm_index| numeric_col_major[price_idx][perm_index as usize])
.collect()
})
.unwrap_or_else(|| vec![f32::NAN; row_count]);
.context("Required numeric column 'Last known price' not configured")?;
// Build contiguous address buffer and address search index (permuted)
tracing::info!("Building interned strings");
@ -1561,6 +1792,20 @@ impl PropertyData {
}
let postcode_interner = postcode_rodeo.into_reader();
let row_to_poi_metric_idx: Vec<u32> = if poi_metrics.is_empty() {
vec![NO_POI_METRIC_ROW; row_count]
} else {
perm.iter()
.map(|&old_row| {
poi_metric_by_postcode
.get(postcode_raw[old_row as usize].as_str())
.copied()
.unwrap_or(NO_POI_METRIC_ROW)
})
.collect()
};
poi_metrics.set_row_mapping(row_to_poi_metric_idx);
// Pack is_approx_build_date into a bitvec (8 bools per byte)
let num_bytes = row_count.div_ceil(8);
let mut approx_build_date_bits = vec![0u8; num_bytes];
@ -1697,6 +1942,7 @@ impl PropertyData {
quant_min,
quant_range,
feature_stats,
poi_metrics,
last_known_price_raw,
address_buffer,
address_offsets,