Add property listing
This commit is contained in:
parent
51967fa880
commit
85f5770e09
3 changed files with 254 additions and 7 deletions
|
|
@@ -6,7 +6,7 @@ use std::path::Path;
|
|||
|
||||
use crate::consts::{FEATURE_PERCENTILE_LOW, FEATURE_PERCENTILE_HIGH, HISTOGRAM_BINS, H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX};
|
||||
|
||||
/// Columns to exclude from feature discovery (not numeric features)
|
||||
/// Columns to exclude from feature discovery
|
||||
const EXCLUDED_COLUMNS: &[&str] = &["lat", "lon"];
|
||||
|
||||
/// H3 valid resolution range (0-15)
|
||||
|
|
@@ -65,6 +65,13 @@ pub struct PropertyData {
|
|||
pub feature_data: Vec<f64>,
|
||||
/// Precomputed stats (percentiles + histogram) for each feature
|
||||
pub feature_stats: Vec<FeatureStats>,
|
||||
/// String fields for property details
|
||||
pub address: Vec<String>,
|
||||
pub postcode: Vec<String>,
|
||||
pub property_type: Vec<String>,
|
||||
pub built_form: Vec<String>,
|
||||
pub current_energy_rating: Vec<String>,
|
||||
pub potential_energy_rating: Vec<String>,
|
||||
}
|
||||
|
||||
/// Approximate a percentile from a histogram using linear interpolation.
|
||||
|
|
@@ -213,14 +220,34 @@ impl PropertyData {
|
|||
let mut cols_needed: Vec<String> = vec!["lat".into(), "lon".into()];
|
||||
cols_needed.extend(feature_names.iter().cloned());
|
||||
|
||||
// Add string columns (using actual column names from parquet)
|
||||
let string_cols = vec![
|
||||
"pp_address", "postcode", "pp_property_type", "built_form",
|
||||
"current_energy_rating", "potential_energy_rating"
|
||||
];
|
||||
|
||||
// Build selection with proper casting
|
||||
let mut select_exprs: Vec<polars::prelude::Expr> = vec![];
|
||||
|
||||
// lat/lon as f64
|
||||
select_exprs.push(col("lat").cast(DataType::Float64));
|
||||
select_exprs.push(col("lon").cast(DataType::Float64));
|
||||
|
||||
// numeric features as f64
|
||||
for name in &feature_names {
|
||||
select_exprs.push(col(name.as_str()).cast(DataType::Float64));
|
||||
}
|
||||
|
||||
// string columns as string (check if they exist in schema)
|
||||
for &s_col in &string_cols {
|
||||
if schema.get(s_col).is_some() {
|
||||
select_exprs.push(col(s_col).cast(DataType::String));
|
||||
}
|
||||
}
|
||||
|
||||
let df = LazyFrame::scan_parquet(parquet_path, Default::default())
|
||||
.expect("Failed to scan parquet")
|
||||
.select(
|
||||
cols_needed
|
||||
.iter()
|
||||
.map(|c| col(c.as_str()).cast(DataType::Float64))
|
||||
.collect::<Vec<_>>(),
|
||||
)
|
||||
.select(select_exprs)
|
||||
.collect()
|
||||
.expect("Failed to read parquet");
|
||||
|
||||
|
|
@@ -262,6 +289,44 @@ impl PropertyData {
|
|||
})
|
||||
.collect();
|
||||
|
||||
// Extract string columns (before permutation)
|
||||
eprintln!("Extracting string columns...");
|
||||
let address_raw: Vec<String> = if let Ok(col) = df.column("pp_address") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
let postcode_raw: Vec<String> = if let Ok(col) = df.column("postcode") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
let property_type_raw: Vec<String> = if let Ok(col) = df.column("pp_property_type") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
let built_form_raw: Vec<String> = if let Ok(col) = df.column("built_form") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
let current_energy_rating_raw: Vec<String> = if let Ok(col) = df.column("current_energy_rating") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
let potential_energy_rating_raw: Vec<String> = if let Ok(col) = df.column("potential_energy_rating") {
|
||||
col.str().unwrap().into_iter().map(|v| v.unwrap_or("").to_string()).collect()
|
||||
} else {
|
||||
vec![String::new(); row_count]
|
||||
};
|
||||
|
||||
// Sort all rows by spatial locality so that grid queries access
|
||||
// contiguous memory (sequential reads instead of random DRAM accesses).
|
||||
// Uses the same 0.01° grid cell as the spatial index for the sort key.
|
||||
|
|
@@ -283,6 +348,14 @@ impl PropertyData {
|
|||
let lat: Vec<f64> = perm.iter().map(|&i| lat[i as usize]).collect();
|
||||
let lon: Vec<f64> = perm.iter().map(|&i| lon[i as usize]).collect();
|
||||
|
||||
// Apply permutation to string columns
|
||||
let address: Vec<String> = perm.iter().map(|&i| address_raw[i as usize].clone()).collect();
|
||||
let postcode: Vec<String> = perm.iter().map(|&i| postcode_raw[i as usize].clone()).collect();
|
||||
let property_type: Vec<String> = perm.iter().map(|&i| property_type_raw[i as usize].clone()).collect();
|
||||
let built_form: Vec<String> = perm.iter().map(|&i| built_form_raw[i as usize].clone()).collect();
|
||||
let current_energy_rating: Vec<String> = perm.iter().map(|&i| current_energy_rating_raw[i as usize].clone()).collect();
|
||||
let potential_energy_rating: Vec<String> = perm.iter().map(|&i| potential_energy_rating_raw[i as usize].clone()).collect();
|
||||
|
||||
// Transpose to row-major AND apply spatial permutation in one pass.
|
||||
// Result: all features for one row are contiguous, and spatially
|
||||
// nearby rows are adjacent in memory.
|
||||
|
|
@@ -305,6 +378,12 @@ impl PropertyData {
|
|||
num_features,
|
||||
feature_data,
|
||||
feature_stats,
|
||||
address,
|
||||
postcode,
|
||||
property_type,
|
||||
built_form,
|
||||
current_energy_rating,
|
||||
potential_energy_rating,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue