Format rust
This commit is contained in:
parent
0fde087c3d
commit
f60fbec9d4
5 changed files with 191 additions and 94 deletions
|
|
@ -1,18 +1,13 @@
|
|||
use polars::prelude::*;
|
||||
use polars::lazy::frame::LazyFrame;
|
||||
use polars::prelude::*;
|
||||
use rayon::prelude::*;
|
||||
use serde::Serialize;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::consts::{FEATURE_PERCENTILE_LOW, FEATURE_PERCENTILE_HIGH, HISTOGRAM_BINS, H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX};
|
||||
|
||||
/// Columns to exclude from feature discovery
|
||||
const EXCLUDED_COLUMNS: &[&str] = &["lat", "lon"];
|
||||
|
||||
/// H3 valid resolution range (0-15)
|
||||
pub const MIN_RESOLUTION: u8 = 0;
|
||||
pub const MAX_RESOLUTION: u8 = 15;
|
||||
pub const DEFAULT_RESOLUTION: u8 = 8;
|
||||
use crate::consts::{
|
||||
EXCLUDED_COLUMNS, FEATURE_PERCENTILE_HIGH, FEATURE_PERCENTILE_LOW, H3_PRECOMPUTE_MAX,
|
||||
H3_PRECOMPUTE_MIN, HISTOGRAM_BINS,
|
||||
};
|
||||
|
||||
/// Returns true if the polars DataType is numeric (integer or float)
|
||||
fn is_numeric_dtype(dtype: &DataType) -> bool {
|
||||
|
|
@ -76,7 +71,13 @@ pub struct PropertyData {
|
|||
|
||||
/// Approximate a percentile from a histogram using linear interpolation.
|
||||
/// `p` is in [0, 100]. `total` is the sum of all bin counts.
|
||||
fn percentile_from_histogram(counts: &[u64], min: f64, bin_width: f64, total: usize, p: f64) -> f64 {
|
||||
fn percentile_from_histogram(
|
||||
counts: &[u64],
|
||||
min: f64,
|
||||
bin_width: f64,
|
||||
total: usize,
|
||||
p: f64,
|
||||
) -> f64 {
|
||||
let target = (p / 100.0) * (total as f64 - 1.0);
|
||||
let mut cumulative = 0u64;
|
||||
for (i, &c) in counts.iter().enumerate() {
|
||||
|
|
@ -104,8 +105,12 @@ fn compute_feature_stats(vals: &[f64]) -> FeatureStats {
|
|||
let mut count = 0usize;
|
||||
for &v in vals {
|
||||
if !v.is_nan() {
|
||||
if v < min { min = v; }
|
||||
if v > max { max = v; }
|
||||
if v < min {
|
||||
min = v;
|
||||
}
|
||||
if v > max {
|
||||
max = v;
|
||||
}
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
|
|
@ -222,8 +227,12 @@ impl PropertyData {
|
|||
|
||||
// Add string columns (using actual column names from parquet)
|
||||
let string_cols = vec![
|
||||
"pp_address", "postcode", "pp_property_type", "built_form",
|
||||
"current_energy_rating", "potential_energy_rating"
|
||||
"pp_address",
|
||||
"postcode",
|
||||
"pp_property_type",
|
||||
"built_form",
|
||||
"current_energy_rating",
|
||||
"potential_energy_rating",
|
||||
];
|
||||
|
||||
// Build selection with proper casting
|
||||
|
|
@ -256,10 +265,20 @@ impl PropertyData {
|
|||
|
||||
// Extract lat/lon using bulk iterator
|
||||
let lat_series = df.column("lat").unwrap().cast(&DataType::Float64).unwrap();
|
||||
let lat: Vec<f64> = lat_series.f64().unwrap().into_iter().map(|v| v.unwrap_or(0.0)).collect();
|
||||
let lat: Vec<f64> = lat_series
|
||||
.f64()
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|v| v.unwrap_or(0.0))
|
||||
.collect();
|
||||
|
||||
let lon_series = df.column("lon").unwrap().cast(&DataType::Float64).unwrap();
|
||||
let lon: Vec<f64> = lon_series.f64().unwrap().into_iter().map(|v| v.unwrap_or(0.0)).collect();
|
||||
let lon: Vec<f64> = lon_series
|
||||
.f64()
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|v| v.unwrap_or(0.0))
|
||||
.collect();
|
||||
|
||||
// Extract feature columns (column-major, for cache-friendly histogram computation)
|
||||
eprintln!("Extracting feature columns...");
|
||||
|
|
@ -281,8 +300,10 @@ impl PropertyData {
|
|||
eprintln!(
|
||||
" {}: p{}={:.2}, p{}={:.2}, {} bins",
|
||||
feature_names[i],
|
||||
FEATURE_PERCENTILE_LOW, stats.p_low,
|
||||
FEATURE_PERCENTILE_HIGH, stats.p_high,
|
||||
FEATURE_PERCENTILE_LOW,
|
||||
stats.p_low,
|
||||
FEATURE_PERCENTILE_HIGH,
|
||||
stats.p_high,
|
||||
stats.histogram.counts.len()
|
||||
);
|
||||
stats
|
||||
|
|
@ -292,40 +313,66 @@ impl PropertyData {
|
|||
// Extract string columns (before permutation)
|
||||
eprintln!("Extracting string columns...");
|
||||
let address_raw: Vec<String> = if let Ok(col) = df.column("pp_address") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
col.str()
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|v| v.unwrap_or("").to_string())
|
||||
.collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
let postcode_raw: Vec<String> = if let Ok(col) = df.column("postcode") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
col.str()
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|v| v.unwrap_or("").to_string())
|
||||
.collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
let property_type_raw: Vec<String> = if let Ok(col) = df.column("pp_property_type") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
col.str()
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|v| v.unwrap_or("").to_string())
|
||||
.collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
let built_form_raw: Vec<String> = if let Ok(col) = df.column("built_form") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
col.str()
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|v| v.unwrap_or("").to_string())
|
||||
.collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
let current_energy_rating_raw: Vec<String> = if let Ok(col) = df.column("current_energy_rating") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
let current_energy_rating_raw: Vec<String> =
|
||||
if let Ok(col) = df.column("current_energy_rating") {
|
||||
col.str()
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|v| v.unwrap_or("").to_string())
|
||||
.collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
let potential_energy_rating_raw: Vec<String> = if let Ok(col) = df.column("potential_energy_rating") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
let potential_energy_rating_raw: Vec<String> =
|
||||
if let Ok(col) = df.column("potential_energy_rating") {
|
||||
col.str()
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|v| v.unwrap_or("").to_string())
|
||||
.collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
// Sort all rows by spatial locality so that grid queries access
|
||||
// contiguous memory (sequential reads instead of random DRAM accesses).
|
||||
|
|
@ -349,12 +396,30 @@ impl PropertyData {
|
|||
let lon: Vec<f64> = perm.iter().map(|&i| lon[i as usize]).collect();
|
||||
|
||||
// Apply permutation to string columns
|
||||
let address: Vec<String> = perm.iter().map(|&i| address_raw[i as usize].clone()).collect();
|
||||
let postcode: Vec<String> = perm.iter().map(|&i| postcode_raw[i as usize].clone()).collect();
|
||||
let property_type: Vec<String> = perm.iter().map(|&i| property_type_raw[i as usize].clone()).collect();
|
||||
let built_form: Vec<String> = perm.iter().map(|&i| built_form_raw[i as usize].clone()).collect();
|
||||
let current_energy_rating: Vec<String> = perm.iter().map(|&i| current_energy_rating_raw[i as usize].clone()).collect();
|
||||
let potential_energy_rating: Vec<String> = perm.iter().map(|&i| potential_energy_rating_raw[i as usize].clone()).collect();
|
||||
let address: Vec<String> = perm
|
||||
.iter()
|
||||
.map(|&i| address_raw[i as usize].clone())
|
||||
.collect();
|
||||
let postcode: Vec<String> = perm
|
||||
.iter()
|
||||
.map(|&i| postcode_raw[i as usize].clone())
|
||||
.collect();
|
||||
let property_type: Vec<String> = perm
|
||||
.iter()
|
||||
.map(|&i| property_type_raw[i as usize].clone())
|
||||
.collect();
|
||||
let built_form: Vec<String> = perm
|
||||
.iter()
|
||||
.map(|&i| built_form_raw[i as usize].clone())
|
||||
.collect();
|
||||
let current_energy_rating: Vec<String> = perm
|
||||
.iter()
|
||||
.map(|&i| current_energy_rating_raw[i as usize].clone())
|
||||
.collect();
|
||||
let potential_energy_rating: Vec<String> = perm
|
||||
.iter()
|
||||
.map(|&i| potential_energy_rating_raw[i as usize].clone())
|
||||
.collect();
|
||||
|
||||
// Transpose to row-major AND apply spatial permutation in one pass.
|
||||
// Result: all features for one row are contiguous, and spatially
|
||||
|
|
@ -422,7 +487,8 @@ impl POIData {
|
|||
eprintln!("Loaded {} POIs", row_count);
|
||||
|
||||
// Extract columns
|
||||
let id: Vec<String> = df.column("id")
|
||||
let id: Vec<String> = df
|
||||
.column("id")
|
||||
.unwrap()
|
||||
.str()
|
||||
.unwrap()
|
||||
|
|
@ -430,7 +496,8 @@ impl POIData {
|
|||
.map(|v| v.unwrap_or("").to_string())
|
||||
.collect();
|
||||
|
||||
let name: Vec<String> = df.column("name")
|
||||
let name: Vec<String> = df
|
||||
.column("name")
|
||||
.unwrap()
|
||||
.str()
|
||||
.unwrap()
|
||||
|
|
@ -438,7 +505,8 @@ impl POIData {
|
|||
.map(|v| v.unwrap_or("").to_string())
|
||||
.collect();
|
||||
|
||||
let category: Vec<String> = df.column("category")
|
||||
let category: Vec<String> = df
|
||||
.column("category")
|
||||
.unwrap()
|
||||
.str()
|
||||
.unwrap()
|
||||
|
|
@ -446,7 +514,8 @@ impl POIData {
|
|||
.map(|v| v.unwrap_or("").to_string())
|
||||
.collect();
|
||||
|
||||
let lat: Vec<f64> = df.column("lat")
|
||||
let lat: Vec<f64> = df
|
||||
.column("lat")
|
||||
.unwrap()
|
||||
.f64()
|
||||
.unwrap()
|
||||
|
|
@ -454,7 +523,8 @@ impl POIData {
|
|||
.map(|v| v.unwrap_or(0.0))
|
||||
.collect();
|
||||
|
||||
let lng: Vec<f64> = df.column("lng")
|
||||
let lng: Vec<f64> = df
|
||||
.column("lng")
|
||||
.unwrap()
|
||||
.f64()
|
||||
.unwrap()
|
||||
|
|
@ -462,7 +532,8 @@ impl POIData {
|
|||
.map(|v| v.unwrap_or(0.0))
|
||||
.collect();
|
||||
|
||||
let emoji: Vec<String> = df.column("emoji")
|
||||
let emoji: Vec<String> = df
|
||||
.column("emoji")
|
||||
.unwrap()
|
||||
.str()
|
||||
.unwrap()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue