Improve performance by parallelizing column extraction, enum encoding, spatial sort, and row-major transpose
This commit is contained in:
parent
8032011708
commit
48f2c97487
1 changed file with 24 additions and 23 deletions
|
|
@ -656,7 +656,7 @@ impl PropertyData {
|
|||
|
||||
tracing::info!("Extracting numeric feature columns");
|
||||
let numeric_col_major: Vec<Vec<f32>> = numeric_names
|
||||
.iter()
|
||||
.par_iter()
|
||||
.map(|name| {
|
||||
let column = df
|
||||
.column(name)
|
||||
|
|
@ -733,12 +733,11 @@ impl PropertyData {
|
|||
|
||||
tracing::info!("Building enum features");
|
||||
// enum_col_major: Vec<(values_list, encoded_as_f32)>
|
||||
let mut enum_col_major: Vec<(Vec<String>, Vec<f32>)> = Vec::new();
|
||||
for &name in &enum_names {
|
||||
if let Ok(column_data) = df.column(name) {
|
||||
let string_column = column_data
|
||||
.str()
|
||||
.with_context(|| format!("Enum column '{name}' is not a string column"))?;
|
||||
let enum_col_major: Vec<(Vec<String>, Vec<f32>)> = enum_names
|
||||
.par_iter()
|
||||
.filter_map(|&name| {
|
||||
let column_data = df.column(name).ok()?;
|
||||
let string_column = column_data.str().ok()?;
|
||||
let unique_set: std::collections::HashSet<String> = string_column
|
||||
.into_iter()
|
||||
.filter_map(|value| {
|
||||
|
|
@ -795,9 +794,9 @@ impl PropertyData {
|
|||
.collect();
|
||||
|
||||
tracing::debug!(column = %name, unique_values = unique.len(), "Enum feature encoded as f32");
|
||||
enum_col_major.push((unique, encoded));
|
||||
}
|
||||
}
|
||||
Some((unique, encoded))
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Extract is_approx_build_date: 0.0 = exact, anything else (1.0/NaN) = approximate
|
||||
let is_approx_build_date_raw: Vec<bool> = if has_approx_col {
|
||||
|
|
@ -920,7 +919,7 @@ impl PropertyData {
|
|||
let grid_cols = ((max_lon_val - min_lon_val) / grid_cell_size).ceil() as u64 + 1;
|
||||
|
||||
let mut perm: Vec<u32> = (0..row_count as u32).collect();
|
||||
perm.sort_unstable_by_key(|&perm_index| {
|
||||
perm.par_sort_unstable_by_key(|&perm_index| {
|
||||
let grid_row = ((lat[perm_index as usize] - min_lat_val) / grid_cell_size) as u64;
|
||||
let grid_col = ((lon[perm_index as usize] - min_lon_val) / grid_cell_size) as u64;
|
||||
grid_row * grid_cols + grid_col
|
||||
|
|
@ -1036,18 +1035,20 @@ impl PropertyData {
|
|||
// Combines numeric and enum features into a single feature_data array.
|
||||
tracing::info!("Transposing to row-major layout (spatially sorted)");
|
||||
let mut feature_data = vec![f32::NAN; row_count * num_features];
|
||||
for (new_row, &old_row) in perm.iter().enumerate() {
|
||||
let old_index = old_row as usize;
|
||||
let dst_base = new_row * num_features;
|
||||
// Numeric features
|
||||
for (feat_idx, col_vec) in numeric_col_major.iter().enumerate() {
|
||||
feature_data[dst_base + feat_idx] = col_vec[old_index];
|
||||
}
|
||||
// Enum features (stored as f32 indices)
|
||||
for (enum_idx, (_, encoded)) in enum_col_major.iter().enumerate() {
|
||||
feature_data[dst_base + num_numeric + enum_idx] = encoded[old_index];
|
||||
}
|
||||
}
|
||||
feature_data
|
||||
.par_chunks_mut(num_features)
|
||||
.enumerate()
|
||||
.for_each(|(new_row, row_slice)| {
|
||||
let old_index = perm[new_row] as usize;
|
||||
// Numeric features
|
||||
for (feat_idx, col_vec) in numeric_col_major.iter().enumerate() {
|
||||
row_slice[feat_idx] = col_vec[old_index];
|
||||
}
|
||||
// Enum features (stored as f32 indices)
|
||||
for (enum_idx, (_, encoded)) in enum_col_major.iter().enumerate() {
|
||||
row_slice[num_numeric + enum_idx] = encoded[old_index];
|
||||
}
|
||||
});
|
||||
|
||||
tracing::info!("Data loading complete");
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue