vibes
This commit is contained in:
parent
80c093b7ba
commit
f72c43a9fa
101 changed files with 2168 additions and 1177 deletions
|
|
@ -18,7 +18,12 @@ pub struct POICategoryGroup {
|
|||
}
|
||||
|
||||
pub struct POIData {
|
||||
pub id: Vec<String>,
|
||||
/// Contiguous buffer holding all POI ID strings end-to-end.
|
||||
id_buffer: String,
|
||||
/// Byte offset into `id_buffer` where each row's ID starts.
|
||||
id_offsets: Vec<u32>,
|
||||
/// Length in bytes of each row's ID.
|
||||
id_lengths: Vec<u8>,
|
||||
pub group: InternedColumn,
|
||||
pub category: InternedColumn,
|
||||
pub name: Vec<String>,
|
||||
|
|
@ -31,6 +36,15 @@ pub struct POIData {
|
|||
pub priority: Vec<u32>,
|
||||
}
|
||||
|
||||
impl POIData {
|
||||
/// Get the ID string for a given row.
|
||||
pub fn id(&self, row: usize) -> &str {
|
||||
let offset = self.id_offsets[row] as usize;
|
||||
let length = self.id_lengths[row] as usize;
|
||||
&self.id_buffer[offset..offset + length]
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_str_col(df: &DataFrame, name: &str) -> anyhow::Result<Vec<String>> {
|
||||
let column = df
|
||||
.column(name)
|
||||
|
|
@ -72,7 +86,7 @@ impl POIData {
|
|||
let row_count = df.height();
|
||||
info!("Loaded {} POIs", row_count);
|
||||
|
||||
let id: Vec<String> = extract_str_col(&df, "id")?;
|
||||
let id_raw: Vec<String> = extract_str_col(&df, "id")?;
|
||||
let name = extract_str_col(&df, "name")?;
|
||||
let category_raw = extract_str_col(&df, "category")?;
|
||||
let group_raw = extract_str_col(&df, "group")?;
|
||||
|
|
@ -80,6 +94,19 @@ impl POIData {
|
|||
let lng = extract_f32_col(&df, "lng", 0.0)?;
|
||||
let emoji_raw = extract_str_col(&df, "emoji")?;
|
||||
|
||||
// Pack POI IDs into a contiguous buffer
|
||||
let total_id_bytes: usize = id_raw.iter().map(|s| s.len()).sum();
|
||||
let mut id_buffer = String::with_capacity(total_id_bytes);
|
||||
let mut id_offsets = Vec::with_capacity(row_count);
|
||||
let mut id_lengths = Vec::with_capacity(row_count);
|
||||
for s in &id_raw {
|
||||
let offset = id_buffer.len() as u32;
|
||||
let length = s.len().min(u8::MAX as usize) as u8;
|
||||
id_offsets.push(offset);
|
||||
id_lengths.push(length);
|
||||
id_buffer.push_str(&s[..length as usize]);
|
||||
}
|
||||
|
||||
let category = InternedColumn::build(&category_raw);
|
||||
let group = InternedColumn::build(&group_raw);
|
||||
let emoji = InternedColumn::build(&emoji_raw);
|
||||
|
|
@ -99,7 +126,9 @@ impl POIData {
|
|||
info!("POI data loading complete.");
|
||||
|
||||
Ok(POIData {
|
||||
id,
|
||||
id_buffer,
|
||||
id_offsets,
|
||||
id_lengths,
|
||||
name,
|
||||
category,
|
||||
group,
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ use std::path::Path;
|
|||
|
||||
use rustc_hash::FxHashMap;
|
||||
|
||||
use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS};
|
||||
use crate::consts::{H3_PRECOMPUTE_MAX, HISTOGRAM_BINS, NAN_U16, QUANT_SCALE};
|
||||
use crate::features::{self, Bounds};
|
||||
|
||||
fn is_numeric_dtype(dtype: &DataType) -> bool {
|
||||
|
|
@ -47,6 +47,38 @@ pub struct Histogram {
|
|||
pub counts: Vec<u64>,
|
||||
}
|
||||
|
||||
impl Histogram {
|
||||
/// Return the bin index for a given value using the outlier-bracket layout.
|
||||
#[cfg(test)]
|
||||
pub fn bin_for_value(&self, value: f32) -> usize {
|
||||
let num_bins = self.counts.len();
|
||||
if value < self.p1 {
|
||||
0
|
||||
} else if value >= self.p99 {
|
||||
num_bins - 1
|
||||
} else {
|
||||
let middle_bins = num_bins.saturating_sub(2);
|
||||
if middle_bins > 0 && self.p99 > self.p1 {
|
||||
let width = (self.p99 - self.p1) / middle_bins as f32;
|
||||
let middle_bin = ((value - self.p1) / width) as usize;
|
||||
(1 + middle_bin).min(num_bins - 2)
|
||||
} else {
|
||||
num_bins / 2
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Width of a single middle bin (bins 1..n-2).
|
||||
#[cfg(test)]
|
||||
pub fn middle_bin_width(&self) -> f32 {
|
||||
let middle_bins = self.counts.len().saturating_sub(2);
|
||||
if middle_bins > 0 && self.p99 > self.p1 {
|
||||
(self.p99 - self.p1) / middle_bins as f32
|
||||
} else {
|
||||
0.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FeatureStats {
|
||||
pub slider_min: f32,
|
||||
|
|
@ -60,14 +92,67 @@ pub struct RenovationEvent {
|
|||
pub event: String,
|
||||
}
|
||||
|
||||
/// Lightweight reference to quantization parameters for decoding u16 feature data.
|
||||
pub struct QuantRef<'a> {
|
||||
pub dequant_a: &'a [f32],
|
||||
pub quant_min: &'a [f32],
|
||||
pub quant_range: &'a [f32],
|
||||
pub num_numeric: usize,
|
||||
}
|
||||
|
||||
impl QuantRef<'_> {
|
||||
/// Decode a raw u16 value back to f32.
|
||||
#[inline]
|
||||
pub fn decode(&self, feat_idx: usize, raw: u16) -> f32 {
|
||||
if raw == NAN_U16 {
|
||||
return f32::NAN;
|
||||
}
|
||||
if feat_idx >= self.num_numeric {
|
||||
raw as f32
|
||||
} else {
|
||||
raw as f32 * self.dequant_a[feat_idx] + self.quant_min[feat_idx]
|
||||
}
|
||||
}
|
||||
|
||||
/// Encode a filter minimum bound to u16 (floors to include boundary values).
|
||||
#[inline]
|
||||
pub fn encode_min(&self, feat_idx: usize, value: f32) -> u16 {
|
||||
if !value.is_finite() || self.quant_range[feat_idx] == 0.0 {
|
||||
return 0;
|
||||
}
|
||||
let norm = (value - self.quant_min[feat_idx]) / self.quant_range[feat_idx];
|
||||
(norm * QUANT_SCALE).floor().clamp(0.0, QUANT_SCALE) as u16
|
||||
}
|
||||
|
||||
/// Encode a filter maximum bound to u16 (ceils to include boundary values).
|
||||
#[inline]
|
||||
pub fn encode_max(&self, feat_idx: usize, value: f32) -> u16 {
|
||||
if !value.is_finite() || self.quant_range[feat_idx] == 0.0 {
|
||||
return QUANT_SCALE as u16;
|
||||
}
|
||||
let norm = (value - self.quant_min[feat_idx]) / self.quant_range[feat_idx];
|
||||
(norm * QUANT_SCALE).ceil().clamp(0.0, QUANT_SCALE) as u16
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PropertyData {
|
||||
pub lat: Vec<f32>,
|
||||
pub lon: Vec<f32>,
|
||||
pub feature_names: Vec<String>,
|
||||
pub num_features: usize,
|
||||
/// Number of numeric features (enum features start at this index).
|
||||
pub num_numeric: usize,
|
||||
/// Row-major flat array: feature_data[row * num_features + feat_idx].
|
||||
/// NaN = null. For enum features, stores the index as f32 (0.0, 1.0, etc).
|
||||
pub feature_data: Vec<f32>,
|
||||
/// Quantized to u16. NaN sentinel = u16::MAX (65535).
|
||||
/// Numeric features: encoded via (val - min) / range * 65534.
|
||||
/// Enum features: stored directly as u16 cast of the f32 index.
|
||||
pub feature_data: Vec<u16>,
|
||||
/// Per-feature: range / QUANT_SCALE for fast decode.
|
||||
dequant_a: Vec<f32>,
|
||||
/// Per-feature: minimum value (offset for dequantization).
|
||||
quant_min: Vec<f32>,
|
||||
/// Per-feature: max - min (for encoding filter bounds).
|
||||
quant_range: Vec<f32>,
|
||||
pub feature_stats: Vec<FeatureStats>,
|
||||
/// Contiguous buffer holding all address strings end-to-end.
|
||||
address_buffer: String,
|
||||
|
|
@ -79,7 +164,7 @@ pub struct PropertyData {
|
|||
postcode_interner: lasso::RodeoReader,
|
||||
postcode_keys: Vec<lasso::Spur>,
|
||||
/// For enum features: maps feature index to list of possible string values.
|
||||
/// Index in values list corresponds to the f32 value stored in feature_data.
|
||||
/// Index in values list corresponds to the u16 value stored in feature_data.
|
||||
pub enum_values: rustc_hash::FxHashMap<usize, Vec<String>>,
|
||||
/// Per-row flag: true = construction date is approximate (from EPC band),
|
||||
/// false = exact (from new-build transaction date).
|
||||
|
|
@ -91,10 +176,11 @@ pub struct PropertyData {
|
|||
/// Per-row listing features (key feature bullet points from online listings).
|
||||
/// Only rows with features are present in the map.
|
||||
listing_features: FxHashMap<u32, Vec<String>>,
|
||||
/// Per-row optional string columns from online listings.
|
||||
listing_url: Vec<Option<String>>,
|
||||
property_sub_type: Vec<Option<String>>,
|
||||
price_qualifier: Vec<Option<String>>,
|
||||
/// Sparse per-row optional string columns from online listings.
|
||||
/// Only rows with non-empty values are stored (saves ~1 GB vs Vec<Option<String>>).
|
||||
listing_url: FxHashMap<u32, String>,
|
||||
property_sub_type: FxHashMap<u32, String>,
|
||||
price_qualifier: FxHashMap<u32, String>,
|
||||
}
|
||||
|
||||
impl PropertyData {
|
||||
|
|
@ -139,17 +225,43 @@ impl PropertyData {
|
|||
|
||||
/// Get listing URL for a given row.
|
||||
pub fn listing_url(&self, row: usize) -> Option<&str> {
|
||||
self.listing_url[row].as_deref()
|
||||
self.listing_url.get(&(row as u32)).map(String::as_str)
|
||||
}
|
||||
|
||||
/// Get property sub-type for a given row.
|
||||
pub fn property_sub_type(&self, row: usize) -> Option<&str> {
|
||||
self.property_sub_type[row].as_deref()
|
||||
self.property_sub_type
|
||||
.get(&(row as u32))
|
||||
.map(String::as_str)
|
||||
}
|
||||
|
||||
/// Get price qualifier for a given row.
|
||||
pub fn price_qualifier(&self, row: usize) -> Option<&str> {
|
||||
self.price_qualifier[row].as_deref()
|
||||
self.price_qualifier.get(&(row as u32)).map(String::as_str)
|
||||
}
|
||||
|
||||
/// Decode a single feature value from quantized u16 storage.
|
||||
#[inline]
|
||||
pub fn get_feature(&self, row: usize, feat_idx: usize) -> f32 {
|
||||
let raw = self.feature_data[row * self.num_features + feat_idx];
|
||||
if raw == NAN_U16 {
|
||||
return f32::NAN;
|
||||
}
|
||||
if feat_idx >= self.num_numeric {
|
||||
raw as f32
|
||||
} else {
|
||||
raw as f32 * self.dequant_a[feat_idx] + self.quant_min[feat_idx]
|
||||
}
|
||||
}
|
||||
|
||||
/// Get a QuantRef for passing to aggregation/filter functions.
|
||||
pub fn quant_ref(&self) -> QuantRef<'_> {
|
||||
QuantRef {
|
||||
dequant_a: &self.dequant_a,
|
||||
quant_min: &self.quant_min,
|
||||
quant_range: &self.quant_range,
|
||||
num_numeric: self.num_numeric,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -355,13 +467,12 @@ pub fn precompute_h3(lat: &[f32], lon: &[f32]) -> anyhow::Result<Vec<u64>> {
|
|||
.zip(lon.par_iter())
|
||||
.enumerate()
|
||||
.map(|(i, (&latitude, &longitude))| {
|
||||
let coord = h3o::LatLng::new(latitude as f64, longitude as f64)
|
||||
.unwrap_or_else(|err| {
|
||||
panic!(
|
||||
"Invalid coordinates at row {}: lat={}, lon={}: {}",
|
||||
i, latitude, longitude, err
|
||||
)
|
||||
});
|
||||
let coord = h3o::LatLng::new(latitude as f64, longitude as f64).unwrap_or_else(|err| {
|
||||
panic!(
|
||||
"Invalid coordinates at row {}: lat={}, lon={}: {}",
|
||||
i, latitude, longitude, err
|
||||
)
|
||||
});
|
||||
u64::from(coord.to_cell(h3_res))
|
||||
})
|
||||
.collect();
|
||||
|
|
@ -378,7 +489,10 @@ impl PropertyData {
|
|||
listings_rent_path: &Path,
|
||||
) -> anyhow::Result<Self> {
|
||||
// Load postcode.parquet
|
||||
tracing::info!("Loading postcode features from {:?}", postcode_features_path);
|
||||
tracing::info!(
|
||||
"Loading postcode features from {:?}",
|
||||
postcode_features_path
|
||||
);
|
||||
let postcode_df = LazyFrame::scan_parquet(postcode_features_path, Default::default())
|
||||
.context("Failed to scan postcode parquet")?
|
||||
.collect()
|
||||
|
|
@ -623,6 +737,16 @@ impl PropertyData {
|
|||
})
|
||||
.collect::<anyhow::Result<Vec<_>>>()?;
|
||||
|
||||
// Compute quantization parameters from feature stats (numeric features)
|
||||
let mut quant_min = Vec::with_capacity(num_features);
|
||||
let mut quant_range = Vec::with_capacity(num_features);
|
||||
for stats in &numeric_feature_stats {
|
||||
let min = stats.histogram.min;
|
||||
let max = stats.histogram.max;
|
||||
quant_min.push(min);
|
||||
quant_range.push(if max > min { max - min } else { 0.0 });
|
||||
}
|
||||
|
||||
tracing::info!("Extracting string columns");
|
||||
let extract_string_col = |df: &DataFrame, name: &str| -> anyhow::Result<Vec<String>> {
|
||||
let column = df
|
||||
|
|
@ -928,19 +1052,34 @@ impl PropertyData {
|
|||
map
|
||||
};
|
||||
|
||||
// Permute optional string columns
|
||||
let listing_url: Vec<Option<String>> = perm
|
||||
.iter()
|
||||
.map(|&old_row| listing_url_raw[old_row as usize].clone())
|
||||
.collect();
|
||||
let property_sub_type: Vec<Option<String>> = perm
|
||||
.iter()
|
||||
.map(|&old_row| property_sub_type_raw[old_row as usize].clone())
|
||||
.collect();
|
||||
let price_qualifier: Vec<Option<String>> = perm
|
||||
.iter()
|
||||
.map(|&old_row| price_qualifier_raw[old_row as usize].clone())
|
||||
.collect();
|
||||
// Permute optional string columns into sparse HashMaps
|
||||
let listing_url: FxHashMap<u32, String> = {
|
||||
let mut map = FxHashMap::default();
|
||||
for (new_row, &old_row) in perm.iter().enumerate() {
|
||||
if let Some(ref s) = listing_url_raw[old_row as usize] {
|
||||
map.insert(new_row as u32, s.clone());
|
||||
}
|
||||
}
|
||||
map
|
||||
};
|
||||
let property_sub_type: FxHashMap<u32, String> = {
|
||||
let mut map = FxHashMap::default();
|
||||
for (new_row, &old_row) in perm.iter().enumerate() {
|
||||
if let Some(ref s) = property_sub_type_raw[old_row as usize] {
|
||||
map.insert(new_row as u32, s.clone());
|
||||
}
|
||||
}
|
||||
map
|
||||
};
|
||||
let price_qualifier: FxHashMap<u32, String> = {
|
||||
let mut map = FxHashMap::default();
|
||||
for (new_row, &old_row) in perm.iter().enumerate() {
|
||||
if let Some(ref s) = price_qualifier_raw[old_row as usize] {
|
||||
map.insert(new_row as u32, s.clone());
|
||||
}
|
||||
}
|
||||
map
|
||||
};
|
||||
|
||||
// Build enum_values map: feature_index -> list of string values
|
||||
let mut enum_values: rustc_hash::FxHashMap<usize, Vec<String>> =
|
||||
|
|
@ -967,24 +1106,47 @@ impl PropertyData {
|
|||
counts: vec![0; num_values.max(1)],
|
||||
},
|
||||
});
|
||||
// Enum features: not quantized, stored directly as u16
|
||||
quant_min.push(0.0);
|
||||
quant_range.push(0.0);
|
||||
}
|
||||
let dequant_a: Vec<f32> = quant_range
|
||||
.iter()
|
||||
.map(|&r| if r > 0.0 { r / QUANT_SCALE } else { 0.0 })
|
||||
.collect();
|
||||
|
||||
// Transpose to row-major AND apply spatial permutation in one pass.
|
||||
// Combines numeric and enum features into a single feature_data array.
|
||||
tracing::info!("Transposing to row-major layout (spatially sorted)");
|
||||
let mut feature_data = vec![f32::NAN; row_count * num_features];
|
||||
// Combines numeric and enum features into a single feature_data array, quantized to u16.
|
||||
tracing::info!("Transposing to row-major layout (spatially sorted, quantized to u16)");
|
||||
let mut feature_data = vec![NAN_U16; row_count * num_features];
|
||||
feature_data
|
||||
.par_chunks_mut(num_features)
|
||||
.enumerate()
|
||||
.for_each(|(new_row, row_slice)| {
|
||||
let old_index = perm[new_row] as usize;
|
||||
// Numeric features
|
||||
// Numeric features: quantize to u16
|
||||
for (feat_idx, col_vec) in numeric_col_major.iter().enumerate() {
|
||||
row_slice[feat_idx] = col_vec[old_index];
|
||||
let value = col_vec[old_index];
|
||||
row_slice[feat_idx] = if value.is_finite() {
|
||||
let range = quant_range[feat_idx];
|
||||
if range > 0.0 {
|
||||
let normalized = (value - quant_min[feat_idx]) / range;
|
||||
(normalized * QUANT_SCALE).round().clamp(0.0, QUANT_SCALE) as u16
|
||||
} else {
|
||||
0
|
||||
}
|
||||
} else {
|
||||
NAN_U16
|
||||
};
|
||||
}
|
||||
// Enum features (stored as f32 indices)
|
||||
// Enum features: store as u16 directly
|
||||
for (enum_idx, (_, encoded)) in enum_col_major.iter().enumerate() {
|
||||
row_slice[num_numeric + enum_idx] = encoded[old_index];
|
||||
let value = encoded[old_index];
|
||||
row_slice[num_numeric + enum_idx] = if value.is_finite() {
|
||||
value as u16
|
||||
} else {
|
||||
NAN_U16
|
||||
};
|
||||
}
|
||||
});
|
||||
|
||||
|
|
@ -995,7 +1157,11 @@ impl PropertyData {
|
|||
lon,
|
||||
feature_names,
|
||||
num_features,
|
||||
num_numeric,
|
||||
feature_data,
|
||||
dequant_a,
|
||||
quant_min,
|
||||
quant_range,
|
||||
feature_stats,
|
||||
address_buffer,
|
||||
address_offsets,
|
||||
|
|
|
|||
|
|
@ -124,10 +124,7 @@ impl TravelTimeStore {
|
|||
if file_name.ends_with(".parquet") {
|
||||
let file_stem = file_name.trim_end_matches(".parquet");
|
||||
let slug = strip_numeric_prefix(file_stem).to_string();
|
||||
slug_to_file.insert(
|
||||
(mode.clone(), slug.clone()),
|
||||
file_stem.to_string(),
|
||||
);
|
||||
slug_to_file.insert((mode.clone(), slug.clone()), file_stem.to_string());
|
||||
slugs.insert(slug);
|
||||
}
|
||||
}
|
||||
|
|
@ -207,10 +204,7 @@ impl TravelTimeStore {
|
|||
for (i, (pc, min)) in postcodes.into_iter().zip(minutes.into_iter()).enumerate() {
|
||||
if let (Some(pc), Some(min)) = (pc, min) {
|
||||
let best_min = best.as_ref().and_then(|b| b.get(i));
|
||||
let journey = journeys
|
||||
.as_ref()
|
||||
.and_then(|j| j.get(i))
|
||||
.map(Arc::from);
|
||||
let journey = journeys.as_ref().and_then(|j| j.get(i)).map(Arc::from);
|
||||
map.insert(
|
||||
pc.to_string(),
|
||||
TravelDataRow {
|
||||
|
|
@ -274,10 +268,15 @@ mod tests {
|
|||
|
||||
#[test]
fn strip_numeric_prefix_basic() {
    // (input, expected) pairs: prefixed slugs lose the numeric prefix,
    // unprefixed slugs come back unchanged.
    let cases = [
        ("000000-bank-tube-station", "bank-tube-station"),
        ("000123-abbey-hey", "abbey-hey"),
        ("bank-tube-station", "bank-tube-station"),
        ("london", "london"),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(strip_numeric_prefix(input), expected);
    }
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue