Improve perf: parallelize numeric column extraction, enum encoding, spatial sort, and row-major transpose

This commit is contained in:
Andras Schmelczer 2026-02-22 23:14:36 +00:00
parent 8032011708
commit 48f2c97487

View file

@ -656,7 +656,7 @@ impl PropertyData {
tracing::info!("Extracting numeric feature columns");
let numeric_col_major: Vec<Vec<f32>> = numeric_names
.iter()
.par_iter()
.map(|name| {
let column = df
.column(name)
@ -733,12 +733,11 @@ impl PropertyData {
tracing::info!("Building enum features");
// enum_col_major: Vec<(values_list, encoded_as_f32)>
let mut enum_col_major: Vec<(Vec<String>, Vec<f32>)> = Vec::new();
for &name in &enum_names {
if let Ok(column_data) = df.column(name) {
let string_column = column_data
.str()
.with_context(|| format!("Enum column '{name}' is not a string column"))?;
let enum_col_major: Vec<(Vec<String>, Vec<f32>)> = enum_names
.par_iter()
.filter_map(|&name| {
let column_data = df.column(name).ok()?;
let string_column = column_data.str().ok()?;
let unique_set: std::collections::HashSet<String> = string_column
.into_iter()
.filter_map(|value| {
@ -795,9 +794,9 @@ impl PropertyData {
.collect();
tracing::debug!(column = %name, unique_values = unique.len(), "Enum feature encoded as f32");
enum_col_major.push((unique, encoded));
}
}
Some((unique, encoded))
})
.collect();
// Extract is_approx_build_date: 0.0 = exact, anything else (1.0/NaN) = approximate
let is_approx_build_date_raw: Vec<bool> = if has_approx_col {
@ -920,7 +919,7 @@ impl PropertyData {
let grid_cols = ((max_lon_val - min_lon_val) / grid_cell_size).ceil() as u64 + 1;
let mut perm: Vec<u32> = (0..row_count as u32).collect();
perm.sort_unstable_by_key(|&perm_index| {
perm.par_sort_unstable_by_key(|&perm_index| {
let grid_row = ((lat[perm_index as usize] - min_lat_val) / grid_cell_size) as u64;
let grid_col = ((lon[perm_index as usize] - min_lon_val) / grid_cell_size) as u64;
grid_row * grid_cols + grid_col
@ -1036,18 +1035,20 @@ impl PropertyData {
// Combines numeric and enum features into a single feature_data array.
tracing::info!("Transposing to row-major layout (spatially sorted)");
let mut feature_data = vec![f32::NAN; row_count * num_features];
for (new_row, &old_row) in perm.iter().enumerate() {
let old_index = old_row as usize;
let dst_base = new_row * num_features;
// Numeric features
for (feat_idx, col_vec) in numeric_col_major.iter().enumerate() {
feature_data[dst_base + feat_idx] = col_vec[old_index];
}
// Enum features (stored as f32 indices)
for (enum_idx, (_, encoded)) in enum_col_major.iter().enumerate() {
feature_data[dst_base + num_numeric + enum_idx] = encoded[old_index];
}
}
feature_data
.par_chunks_mut(num_features)
.enumerate()
.for_each(|(new_row, row_slice)| {
let old_index = perm[new_row] as usize;
// Numeric features
for (feat_idx, col_vec) in numeric_col_major.iter().enumerate() {
row_slice[feat_idx] = col_vec[old_index];
}
// Enum features (stored as f32 indices)
for (enum_idx, (_, encoded)) in enum_col_major.iter().enumerate() {
row_slice[num_numeric + enum_idx] = encoded[old_index];
}
});
tracing::info!("Data loading complete");