Add property listing

This commit is contained in:
Andras Schmelczer 2026-01-31 12:50:01 +00:00
parent 51967fa880
commit 85f5770e09
3 changed files with 254 additions and 7 deletions

View file

@ -6,7 +6,7 @@ use std::path::Path;
use crate::consts::{FEATURE_PERCENTILE_LOW, FEATURE_PERCENTILE_HIGH, HISTOGRAM_BINS, H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX};
/// Columns to exclude from feature discovery (not numeric features)
/// Columns to exclude from feature discovery
const EXCLUDED_COLUMNS: &[&str] = &["lat", "lon"];
/// H3 valid resolution range (0-15)
@ -65,6 +65,13 @@ pub struct PropertyData {
pub feature_data: Vec<f64>,
/// Precomputed stats (percentiles + histogram) for each feature
pub feature_stats: Vec<FeatureStats>,
/// String fields for property details: one entry per row, in the same spatially sorted row order as `feature_data`
pub address: Vec<String>,
pub postcode: Vec<String>,
pub property_type: Vec<String>,
pub built_form: Vec<String>,
pub current_energy_rating: Vec<String>,
pub potential_energy_rating: Vec<String>,
}
/// Approximate a percentile from a histogram using linear interpolation.
@ -213,14 +220,34 @@ impl PropertyData {
let mut cols_needed: Vec<String> = vec!["lat".into(), "lon".into()];
cols_needed.extend(feature_names.iter().cloned());
// Add string columns (using actual column names from parquet)
let string_cols = vec![
"pp_address", "postcode", "pp_property_type", "built_form",
"current_energy_rating", "potential_energy_rating"
];
// Build selection with proper casting
let mut select_exprs: Vec<polars::prelude::Expr> = vec![];
// lat/lon as f64
select_exprs.push(col("lat").cast(DataType::Float64));
select_exprs.push(col("lon").cast(DataType::Float64));
// numeric features as f64
for name in &feature_names {
select_exprs.push(col(name.as_str()).cast(DataType::Float64));
}
// string columns cast to String; columns missing from the parquet schema are skipped
for &s_col in &string_cols {
if schema.get(s_col).is_some() {
select_exprs.push(col(s_col).cast(DataType::String));
}
}
let df = LazyFrame::scan_parquet(parquet_path, Default::default())
.expect("Failed to scan parquet")
.select(
cols_needed
.iter()
.map(|c| col(c.as_str()).cast(DataType::Float64))
.collect::<Vec<_>>(),
)
.select(select_exprs)
.collect()
.expect("Failed to read parquet");
@ -262,6 +289,44 @@ impl PropertyData {
})
.collect();
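// Extract string columns in original row order (the spatial permutation is applied later).
// Null values become empty strings; a column absent from the frame yields all-empty strings.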
eprintln!("Extracting string columns...");
let address_raw: Vec<String> = if let Ok(col) = df.column("pp_address") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
} else {
vec![String::new(); row_count]
};
let postcode_raw: Vec<String> = if let Ok(col) = df.column("postcode") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
} else {
vec![String::new(); row_count]
};
let property_type_raw: Vec<String> = if let Ok(col) = df.column("pp_property_type") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
} else {
vec![String::new(); row_count]
};
let built_form_raw: Vec<String> = if let Ok(col) = df.column("built_form") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
} else {
vec![String::new(); row_count]
};
let current_energy_rating_raw: Vec<String> = if let Ok(col) = df.column("current_energy_rating") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
} else {
vec![String::new(); row_count]
};
let potential_energy_rating_raw: Vec<String> = if let Ok(col) = df.column("potential_energy_rating") {
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
} else {
vec![String::new(); row_count]
};
// Sort all rows by spatial locality so that grid queries access
// contiguous memory (sequential reads instead of random DRAM accesses).
// Uses the same 0.01° grid cell as the spatial index for the sort key.
@ -283,6 +348,14 @@ impl PropertyData {
let lat: Vec<f64> = perm.iter().map(|&i| lat[i as usize]).collect();
let lon: Vec<f64> = perm.iter().map(|&i| lon[i as usize]).collect();
// Apply permutation to string columns
let address: Vec<String> = perm.iter().map(|&i| address_raw[i as usize].clone()).collect();
let postcode: Vec<String> = perm.iter().map(|&i| postcode_raw[i as usize].clone()).collect();
let property_type: Vec<String> = perm.iter().map(|&i| property_type_raw[i as usize].clone()).collect();
let built_form: Vec<String> = perm.iter().map(|&i| built_form_raw[i as usize].clone()).collect();
let current_energy_rating: Vec<String> = perm.iter().map(|&i| current_energy_rating_raw[i as usize].clone()).collect();
let potential_energy_rating: Vec<String> = perm.iter().map(|&i| potential_energy_rating_raw[i as usize].clone()).collect();
// Transpose to row-major AND apply spatial permutation in one pass.
// Result: all features for one row are contiguous, and spatially
// nearby rows are adjacent in memory.
@ -305,6 +378,12 @@ impl PropertyData {
num_features,
feature_data,
feature_stats,
address,
postcode,
property_type,
built_form,
current_energy_rating,
potential_energy_rating,
}
}
}