Refactor and other improvements
This commit is contained in:
parent
04a78e7bfe
commit
6c90cf3c0f
47 changed files with 2705 additions and 1568 deletions
163
server-rs/src/routes/stats.rs
Normal file
163
server-rs/src/routes/stats.rs
Normal file
|
|
@ -0,0 +1,163 @@
|
|||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use rustc_hash::FxHashMap;
|
||||
|
||||
use crate::consts::MAX_PRICE_HISTORY_POINTS;
|
||||
use crate::data::FeatureStats;
|
||||
|
||||
use super::hexagon_stats::{EnumFeatureStats, HistogramStats, NumericFeatureStats, PricePoint};
|
||||
|
||||
/// Extract price history (year, price) pairs from matching rows, downsampled if needed.
|
||||
pub fn extract_price_history(
|
||||
matching_rows: &[usize],
|
||||
feature_data: &[f32],
|
||||
num_features: usize,
|
||||
feature_name_to_index: &FxHashMap<String, usize>,
|
||||
) -> Vec<PricePoint> {
|
||||
let year_idx = feature_name_to_index
|
||||
.get("Date of last transaction")
|
||||
.copied();
|
||||
let price_idx = feature_name_to_index.get("Last known price").copied();
|
||||
match (year_idx, price_idx) {
|
||||
(Some(yi), Some(pi)) => {
|
||||
let mut points: Vec<PricePoint> = matching_rows
|
||||
.iter()
|
||||
.filter_map(|&row| {
|
||||
let year = feature_data[row * num_features + yi];
|
||||
let price = feature_data[row * num_features + pi];
|
||||
if year.is_finite() && price.is_finite() {
|
||||
Some(PricePoint { year, price })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
if points.len() > MAX_PRICE_HISTORY_POINTS {
|
||||
let step = points.len() as f64 / MAX_PRICE_HISTORY_POINTS as f64;
|
||||
points = (0..MAX_PRICE_HISTORY_POINTS)
|
||||
.map(|i| {
|
||||
let idx = (i as f64 * step) as usize;
|
||||
PricePoint {
|
||||
year: points[idx].year,
|
||||
price: points[idx].price,
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
}
|
||||
points
|
||||
}
|
||||
_ => Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute per-feature stats (numeric histograms + enum counts) for the given rows.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn compute_feature_stats(
|
||||
matching_rows: &[usize],
|
||||
feature_data: &[f32],
|
||||
feature_names: &[String],
|
||||
num_features: usize,
|
||||
enum_values: &FxHashMap<usize, Vec<String>>,
|
||||
feature_stats_data: &[FeatureStats],
|
||||
fields_specified: bool,
|
||||
field_set: &HashSet<String>,
|
||||
) -> (Vec<NumericFeatureStats>, Vec<EnumFeatureStats>) {
|
||||
let mut numeric_features = Vec::new();
|
||||
let mut enum_features_out = Vec::new();
|
||||
|
||||
for (feature_index, feature_name) in feature_names.iter().enumerate() {
|
||||
if fields_specified && !field_set.contains(feature_name.as_str()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(ev) = enum_values.get(&feature_index) {
|
||||
let mut value_counts = vec![0u64; ev.len()];
|
||||
for &row in matching_rows {
|
||||
let value = feature_data[row * num_features + feature_index];
|
||||
if value.is_finite() {
|
||||
let idx = value as usize;
|
||||
if idx < value_counts.len() {
|
||||
value_counts[idx] += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let counts: HashMap<String, u64> = value_counts
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, &count)| count > 0)
|
||||
.map(|(idx, &count)| (ev[idx].clone(), count))
|
||||
.collect();
|
||||
|
||||
if !counts.is_empty() {
|
||||
enum_features_out.push(EnumFeatureStats {
|
||||
name: feature_name.clone(),
|
||||
counts,
|
||||
});
|
||||
}
|
||||
} else {
|
||||
let global_hist = &feature_stats_data[feature_index].histogram;
|
||||
let p1 = global_hist.p1;
|
||||
let p99 = global_hist.p99;
|
||||
let num_bins = global_hist.counts.len();
|
||||
|
||||
let mut count = 0usize;
|
||||
let mut min_value = f32::INFINITY;
|
||||
let mut max_value = f32::NEG_INFINITY;
|
||||
let mut sum = 0.0f64;
|
||||
let mut bins = vec![0u64; num_bins];
|
||||
|
||||
let middle_bins = num_bins.saturating_sub(2);
|
||||
let middle_width = if middle_bins > 0 && p99 > p1 {
|
||||
(p99 - p1) / middle_bins as f32
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
for &row in matching_rows {
|
||||
let value = feature_data[row * num_features + feature_index];
|
||||
if value.is_finite() {
|
||||
count += 1;
|
||||
if value < min_value {
|
||||
min_value = value;
|
||||
}
|
||||
if value > max_value {
|
||||
max_value = value;
|
||||
}
|
||||
sum += value as f64;
|
||||
|
||||
let bin = if value < p1 {
|
||||
0
|
||||
} else if value >= p99 {
|
||||
num_bins - 1
|
||||
} else if middle_width > 0.0 {
|
||||
let middle_bin = ((value - p1) / middle_width) as usize;
|
||||
(1 + middle_bin).min(num_bins - 2)
|
||||
} else {
|
||||
num_bins / 2
|
||||
};
|
||||
bins[bin] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if count > 0 {
|
||||
numeric_features.push(NumericFeatureStats {
|
||||
name: feature_name.clone(),
|
||||
count,
|
||||
min: min_value as f64,
|
||||
max: max_value as f64,
|
||||
mean: sum / count as f64,
|
||||
histogram: HistogramStats {
|
||||
min: global_hist.min as f64,
|
||||
max: global_hist.max as f64,
|
||||
p1: p1 as f64,
|
||||
p99: p99 as f64,
|
||||
counts: bins,
|
||||
},
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
(numeric_features, enum_features_out)
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue