Format rust

This commit is contained in:
Andras Schmelczer 2026-01-31 13:57:43 +00:00
parent 0fde087c3d
commit f60fbec9d4
5 changed files with 191 additions and 94 deletions

View file

@ -1,18 +1,13 @@
use polars::prelude::*;
use polars::lazy::frame::LazyFrame;
use polars::prelude::*;
use rayon::prelude::*;
use serde::Serialize;
use std::path::Path;
use crate::consts::{FEATURE_PERCENTILE_LOW, FEATURE_PERCENTILE_HIGH, HISTOGRAM_BINS, H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX};
/// Columns to exclude from feature discovery
const EXCLUDED_COLUMNS: &[&str] = &["lat", "lon"];
/// H3 valid resolution range (0-15)
pub const MIN_RESOLUTION: u8 = 0;
pub const MAX_RESOLUTION: u8 = 15;
pub const DEFAULT_RESOLUTION: u8 = 8;
use crate::consts::{
EXCLUDED_COLUMNS, FEATURE_PERCENTILE_HIGH, FEATURE_PERCENTILE_LOW, H3_PRECOMPUTE_MAX,
H3_PRECOMPUTE_MIN, HISTOGRAM_BINS,
};
/// Returns true if the polars DataType is numeric (integer or float)
fn is_numeric_dtype(dtype: &DataType) -> bool {
@ -76,7 +71,13 @@ pub struct PropertyData {
/// Approximate a percentile from a histogram using linear interpolation.
/// `p` is in [0, 100]. `total` is the sum of all bin counts.
fn percentile_from_histogram(counts: &[u64], min: f64, bin_width: f64, total: usize, p: f64) -> f64 {
fn percentile_from_histogram(
counts: &[u64],
min: f64,
bin_width: f64,
total: usize,
p: f64,
) -> f64 {
let target = (p / 100.0) * (total as f64 - 1.0);
let mut cumulative = 0u64;
for (i, &c) in counts.iter().enumerate() {
@ -104,8 +105,12 @@ fn compute_feature_stats(vals: &[f64]) -> FeatureStats {
let mut count = 0usize;
for &v in vals {
if !v.is_nan() {
if v < min { min = v; }
if v > max { max = v; }
if v < min {
min = v;
}
if v > max {
max = v;
}
count += 1;
}
}
@ -222,8 +227,12 @@ impl PropertyData {
// Add string columns (using actual column names from parquet)
let string_cols = vec![
"pp_address", "postcode", "pp_property_type", "built_form",
"current_energy_rating", "potential_energy_rating"
"pp_address",
"postcode",
"pp_property_type",
"built_form",
"current_energy_rating",
"potential_energy_rating",
];
// Build selection with proper casting
@ -256,10 +265,20 @@ impl PropertyData {
// Extract lat/lon using bulk iterator
let lat_series = df.column("lat").unwrap().cast(&DataType::Float64).unwrap();
let lat: Vec<f64> = lat_series.f64().unwrap().into_iter().map(|v| v.unwrap_or(0.0)).collect();
let lat: Vec<f64> = lat_series
.f64()
.unwrap()
.into_iter()
.map(|v| v.unwrap_or(0.0))
.collect();
let lon_series = df.column("lon").unwrap().cast(&DataType::Float64).unwrap();
let lon: Vec<f64> = lon_series.f64().unwrap().into_iter().map(|v| v.unwrap_or(0.0)).collect();
let lon: Vec<f64> = lon_series
.f64()
.unwrap()
.into_iter()
.map(|v| v.unwrap_or(0.0))
.collect();
// Extract feature columns (column-major, for cache-friendly histogram computation)
eprintln!("Extracting feature columns...");
@ -281,8 +300,10 @@ impl PropertyData {
eprintln!(
" {}: p{}={:.2}, p{}={:.2}, {} bins",
feature_names[i],
FEATURE_PERCENTILE_LOW, stats.p_low,
FEATURE_PERCENTILE_HIGH, stats.p_high,
FEATURE_PERCENTILE_LOW,
stats.p_low,
FEATURE_PERCENTILE_HIGH,
stats.p_high,
stats.histogram.counts.len()
);
stats
@ -292,40 +313,66 @@ impl PropertyData {
// Extract string columns (before permutation)
eprintln!("Extracting string columns...");
let address_raw: Vec<String> = if let Ok(col) = df.column("pp_address") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
col.str()
.unwrap()
.into_iter()
.map(|v| v.unwrap_or("").to_string())
.collect()
} else {
vec![String::new(); row_count]
};
let postcode_raw: Vec<String> = if let Ok(col) = df.column("postcode") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
col.str()
.unwrap()
.into_iter()
.map(|v| v.unwrap_or("").to_string())
.collect()
} else {
vec![String::new(); row_count]
};
let property_type_raw: Vec<String> = if let Ok(col) = df.column("pp_property_type") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
col.str()
.unwrap()
.into_iter()
.map(|v| v.unwrap_or("").to_string())
.collect()
} else {
vec![String::new(); row_count]
};
let built_form_raw: Vec<String> = if let Ok(col) = df.column("built_form") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
col.str()
.unwrap()
.into_iter()
.map(|v| v.unwrap_or("").to_string())
.collect()
} else {
vec![String::new(); row_count]
};
let current_energy_rating_raw: Vec<String> = if let Ok(col) = df.column("current_energy_rating") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
} else {
vec![String::new(); row_count]
};
let current_energy_rating_raw: Vec<String> =
if let Ok(col) = df.column("current_energy_rating") {
col.str()
.unwrap()
.into_iter()
.map(|v| v.unwrap_or("").to_string())
.collect()
} else {
vec![String::new(); row_count]
};
let potential_energy_rating_raw: Vec<String> = if let Ok(col) = df.column("potential_energy_rating") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
} else {
vec![String::new(); row_count]
};
let potential_energy_rating_raw: Vec<String> =
if let Ok(col) = df.column("potential_energy_rating") {
col.str()
.unwrap()
.into_iter()
.map(|v| v.unwrap_or("").to_string())
.collect()
} else {
vec![String::new(); row_count]
};
// Sort all rows by spatial locality so that grid queries access
// contiguous memory (sequential reads instead of random DRAM accesses).
@ -349,12 +396,30 @@ impl PropertyData {
let lon: Vec<f64> = perm.iter().map(|&i| lon[i as usize]).collect();
// Apply permutation to string columns
let address: Vec<String> = perm.iter().map(|&i| address_raw[i as usize].clone()).collect();
let postcode: Vec<String> = perm.iter().map(|&i| postcode_raw[i as usize].clone()).collect();
let property_type: Vec<String> = perm.iter().map(|&i| property_type_raw[i as usize].clone()).collect();
let built_form: Vec<String> = perm.iter().map(|&i| built_form_raw[i as usize].clone()).collect();
let current_energy_rating: Vec<String> = perm.iter().map(|&i| current_energy_rating_raw[i as usize].clone()).collect();
let potential_energy_rating: Vec<String> = perm.iter().map(|&i| potential_energy_rating_raw[i as usize].clone()).collect();
let address: Vec<String> = perm
.iter()
.map(|&i| address_raw[i as usize].clone())
.collect();
let postcode: Vec<String> = perm
.iter()
.map(|&i| postcode_raw[i as usize].clone())
.collect();
let property_type: Vec<String> = perm
.iter()
.map(|&i| property_type_raw[i as usize].clone())
.collect();
let built_form: Vec<String> = perm
.iter()
.map(|&i| built_form_raw[i as usize].clone())
.collect();
let current_energy_rating: Vec<String> = perm
.iter()
.map(|&i| current_energy_rating_raw[i as usize].clone())
.collect();
let potential_energy_rating: Vec<String> = perm
.iter()
.map(|&i| potential_energy_rating_raw[i as usize].clone())
.collect();
// Transpose to row-major AND apply spatial permutation in one pass.
// Result: all features for one row are contiguous, and spatially
@ -422,7 +487,8 @@ impl POIData {
eprintln!("Loaded {} POIs", row_count);
// Extract columns
let id: Vec<String> = df.column("id")
let id: Vec<String> = df
.column("id")
.unwrap()
.str()
.unwrap()
@ -430,7 +496,8 @@ impl POIData {
.map(|v| v.unwrap_or("").to_string())
.collect();
let name: Vec<String> = df.column("name")
let name: Vec<String> = df
.column("name")
.unwrap()
.str()
.unwrap()
@ -438,7 +505,8 @@ impl POIData {
.map(|v| v.unwrap_or("").to_string())
.collect();
let category: Vec<String> = df.column("category")
let category: Vec<String> = df
.column("category")
.unwrap()
.str()
.unwrap()
@ -446,7 +514,8 @@ impl POIData {
.map(|v| v.unwrap_or("").to_string())
.collect();
let lat: Vec<f64> = df.column("lat")
let lat: Vec<f64> = df
.column("lat")
.unwrap()
.f64()
.unwrap()
@ -454,7 +523,8 @@ impl POIData {
.map(|v| v.unwrap_or(0.0))
.collect();
let lng: Vec<f64> = df.column("lng")
let lng: Vec<f64> = df
.column("lng")
.unwrap()
.f64()
.unwrap()
@ -462,7 +532,8 @@ impl POIData {
.map(|v| v.unwrap_or(0.0))
.collect();
let emoji: Vec<String> = df.column("emoji")
let emoji: Vec<String> = df
.column("emoji")
.unwrap()
.str()
.unwrap()