Improve perf: parallelize feature extraction, spatial sort, and row-major transpose

This commit is contained in:
Andras Schmelczer 2026-02-22 23:14:36 +00:00
parent 8032011708
commit 48f2c97487

View file

@ -656,7 +656,7 @@ impl PropertyData {
tracing::info!("Extracting numeric feature columns"); tracing::info!("Extracting numeric feature columns");
let numeric_col_major: Vec<Vec<f32>> = numeric_names let numeric_col_major: Vec<Vec<f32>> = numeric_names
.iter() .par_iter()
.map(|name| { .map(|name| {
let column = df let column = df
.column(name) .column(name)
@ -733,12 +733,11 @@ impl PropertyData {
tracing::info!("Building enum features"); tracing::info!("Building enum features");
// enum_col_major: Vec<(values_list, encoded_as_f32)> // enum_col_major: Vec<(values_list, encoded_as_f32)>
let mut enum_col_major: Vec<(Vec<String>, Vec<f32>)> = Vec::new(); let enum_col_major: Vec<(Vec<String>, Vec<f32>)> = enum_names
for &name in &enum_names { .par_iter()
if let Ok(column_data) = df.column(name) { .filter_map(|&name| {
let string_column = column_data let column_data = df.column(name).ok()?;
.str() let string_column = column_data.str().ok()?;
.with_context(|| format!("Enum column '{name}' is not a string column"))?;
let unique_set: std::collections::HashSet<String> = string_column let unique_set: std::collections::HashSet<String> = string_column
.into_iter() .into_iter()
.filter_map(|value| { .filter_map(|value| {
@ -795,9 +794,9 @@ impl PropertyData {
.collect(); .collect();
tracing::debug!(column = %name, unique_values = unique.len(), "Enum feature encoded as f32"); tracing::debug!(column = %name, unique_values = unique.len(), "Enum feature encoded as f32");
enum_col_major.push((unique, encoded)); Some((unique, encoded))
} })
} .collect();
// Extract is_approx_build_date: 0.0 = exact, anything else (1.0/NaN) = approximate // Extract is_approx_build_date: 0.0 = exact, anything else (1.0/NaN) = approximate
let is_approx_build_date_raw: Vec<bool> = if has_approx_col { let is_approx_build_date_raw: Vec<bool> = if has_approx_col {
@ -920,7 +919,7 @@ impl PropertyData {
let grid_cols = ((max_lon_val - min_lon_val) / grid_cell_size).ceil() as u64 + 1; let grid_cols = ((max_lon_val - min_lon_val) / grid_cell_size).ceil() as u64 + 1;
let mut perm: Vec<u32> = (0..row_count as u32).collect(); let mut perm: Vec<u32> = (0..row_count as u32).collect();
perm.sort_unstable_by_key(|&perm_index| { perm.par_sort_unstable_by_key(|&perm_index| {
let grid_row = ((lat[perm_index as usize] - min_lat_val) / grid_cell_size) as u64; let grid_row = ((lat[perm_index as usize] - min_lat_val) / grid_cell_size) as u64;
let grid_col = ((lon[perm_index as usize] - min_lon_val) / grid_cell_size) as u64; let grid_col = ((lon[perm_index as usize] - min_lon_val) / grid_cell_size) as u64;
grid_row * grid_cols + grid_col grid_row * grid_cols + grid_col
@ -1036,18 +1035,20 @@ impl PropertyData {
// Combines numeric and enum features into a single feature_data array. // Combines numeric and enum features into a single feature_data array.
tracing::info!("Transposing to row-major layout (spatially sorted)"); tracing::info!("Transposing to row-major layout (spatially sorted)");
let mut feature_data = vec![f32::NAN; row_count * num_features]; let mut feature_data = vec![f32::NAN; row_count * num_features];
for (new_row, &old_row) in perm.iter().enumerate() { feature_data
let old_index = old_row as usize; .par_chunks_mut(num_features)
let dst_base = new_row * num_features; .enumerate()
// Numeric features .for_each(|(new_row, row_slice)| {
for (feat_idx, col_vec) in numeric_col_major.iter().enumerate() { let old_index = perm[new_row] as usize;
feature_data[dst_base + feat_idx] = col_vec[old_index]; // Numeric features
} for (feat_idx, col_vec) in numeric_col_major.iter().enumerate() {
// Enum features (stored as f32 indices) row_slice[feat_idx] = col_vec[old_index];
for (enum_idx, (_, encoded)) in enum_col_major.iter().enumerate() { }
feature_data[dst_base + num_numeric + enum_idx] = encoded[old_index]; // Enum features (stored as f32 indices)
} for (enum_idx, (_, encoded)) in enum_col_major.iter().enumerate() {
} row_slice[num_numeric + enum_idx] = encoded[old_index];
}
});
tracing::info!("Data loading complete"); tracing::info!("Data loading complete");