Rewrite server in Rust

2026-01-31 10:18:54 +00:00 · 2026-01-31 10:18:54 +00:00 · bf2d5de156
commit bf2d5de156
parent 0cea9b873c
13 changed files with 3875 additions and 547 deletions
--- a/server-rs/src/data.rs
+++ b/server-rs/src/data.rs
@ -0,0 +1,405 @@
+use polars::prelude::*;
+use polars::lazy::frame::LazyFrame;
+use rayon::prelude::*;
+use serde::Serialize;
+use std::path::Path;
+
+use crate::consts::{FEATURE_PERCENTILE_LOW, FEATURE_PERCENTILE_HIGH, HISTOGRAM_BINS, H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX};
+
+/// Numeric columns that hold coordinates rather than features; skipped during feature discovery.
+const EXCLUDED_COLUMNS: &[&str] = &["lat", "lon"];
+
+/// Lowest valid H3 resolution (valid H3 resolutions span 0-15 inclusive).
+pub const MIN_RESOLUTION: u8 = 0;
+pub const MAX_RESOLUTION: u8 = 15; // highest valid H3 resolution
+pub const DEFAULT_RESOLUTION: u8 = 8; // NOTE(review): presumably the fallback when no resolution is requested — confirm at call sites
+
+/// Reports whether `dtype` is one of the primitive signed-integer,
+/// unsigned-integer, or floating-point polars types. Every other dtype
+/// (strings, booleans, temporal types, nested types, ...) yields `false`.
+fn is_numeric_dtype(dtype: &DataType) -> bool {
+    let signed_int = matches!(
+        dtype,
+        DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64
+    );
+    let unsigned_int = matches!(
+        dtype,
+        DataType::UInt8 | DataType::UInt16 | DataType::UInt32 | DataType::UInt64
+    );
+    let float = matches!(dtype, DataType::Float32 | DataType::Float64);
+
+    signed_int || unsigned_int || float
+}
+
+/// Equal-width histogram of one feature column's counted (non-null) values; serializable via serde.
+#[derive(Serialize, Clone)]
+pub struct Histogram {
+    /// Left edge of the first bin (the smallest counted value when built by `compute_feature_stats`)
+    pub min: f64,
+    /// NOTE(review): `compute_feature_stats` stores the largest observed value here, while the bins actually end at `min + counts.len() * bin_width`
+    pub max: f64,
+    /// Width of each bin (all bins share the same width)
+    pub bin_width: f64,
+    /// Count of values in each bin; bin `i` starts at `min + i * bin_width`
+    pub counts: Vec<u64>,
+}
+
+/// Precomputed statistics for a single feature: approximate low/high percentile values (`p_low`, `p_high`) and the `histogram` they are estimated from.
+pub struct FeatureStats {
+    pub p_low: f64,
+    pub p_high: f64,
+    pub histogram: Histogram,
+}
+
+/// Columnar storage for all property data; `lat`, `lon` and the rows of `feature_data` share one row order.
+/// Feature values use NaN as the null sentinel (null coordinates are stored as 0.0 by `load`).
+pub struct PropertyData {
+    pub lat: Vec<f64>,
+    pub lon: Vec<f64>,
+    /// Dynamically discovered numeric feature column names, in the column order used by `feature_data`
+    pub feature_names: Vec<String>,
+    /// Number of feature columns (equals `feature_names.len()`)
+    pub num_features: usize,
+    /// Row-major flat array: feature_data[row * num_features + feat_idx].
+    /// NaN = null. All features of one row are stored contiguously.
+    pub feature_data: Vec<f64>,
+    /// Precomputed stats (percentiles + histogram) for each feature, indexed like `feature_names`
+    pub feature_stats: Vec<FeatureStats>,
+}
+
+/// Estimates the `p`-th percentile (`p` in [0, 100]) of the data summarised by
+/// an equal-width histogram.
+///
+/// The percentile is located at fractional rank `(p / 100) * (total - 1)`,
+/// where `total` is the sum of all bin counts. The first bin whose cumulative
+/// count exceeds that rank is selected and the result is linearly interpolated
+/// across it. If no bin qualifies (e.g. `counts` is empty) the right edge of
+/// the last bin, `min + counts.len() * bin_width`, is returned.
+fn percentile_from_histogram(counts: &[u64], min: f64, bin_width: f64, total: usize, p: f64) -> f64 {
+    let rank = (p / 100.0) * (total as f64 - 1.0);
+
+    // Running total of all bins visited so far (inclusive of the current one).
+    let mut seen = 0u64;
+    counts
+        .iter()
+        .enumerate()
+        .find_map(|(bin, &in_bin)| {
+            let before = seen;
+            seen += in_bin;
+            (seen as f64 > rank).then(|| {
+                // Position of the rank inside this bin, as a fraction of its population.
+                let offset = match in_bin {
+                    0 => 0.0,
+                    n => (rank - before as f64) / n as f64,
+                };
+                min + (bin as f64 + offset) * bin_width
+            })
+        })
+        .unwrap_or_else(|| min + counts.len() as f64 * bin_width)
+}
+
+/// Builds an equal-width histogram over `vals` and derives approximate
+/// low/high percentiles from it in two linear passes (no sort).
+///
+/// Only finite values are counted: NaN is the null sentinel, and ±infinity is
+/// ignored as well because it would make the bin width infinite and turn every
+/// derived statistic into inf/NaN.
+///
+/// Returns a `FeatureStats` whose histogram has `HISTOGRAM_BINS` bins starting
+/// at the smallest counted value. `histogram.max` is the largest counted value.
+/// When nothing can be counted, an all-zero histogram with `bin_width` 1.0 and
+/// percentiles of 0.0 is returned.
+fn compute_feature_stats(vals: &[f64]) -> FeatureStats {
+    // Pass 1: observed range and population of the usable values.
+    let (min, max, count) = vals
+        .iter()
+        .copied()
+        .filter(|v| v.is_finite())
+        .fold(
+            (f64::INFINITY, f64::NEG_INFINITY, 0usize),
+            |(lo, hi, n), v| (lo.min(v), hi.max(v), n + 1),
+        );
+
+    if count == 0 {
+        return FeatureStats {
+            p_low: 0.0,
+            p_high: 0.0,
+            histogram: Histogram {
+                min: 0.0,
+                max: 0.0,
+                bin_width: 1.0,
+                counts: vec![0; HISTOGRAM_BINS],
+            },
+        };
+    }
+
+    // A constant column has zero range; use a unit range so bin_width stays non-zero.
+    let range = if max == min { 1.0 } else { max - min };
+    // Stretch the upper edge slightly so the maximum falls inside the last bin
+    // rather than exactly on its right edge.
+    let bin_max = min + range * (1.0 + 1e-9);
+    let bin_width = (bin_max - min) / HISTOGRAM_BINS as f64;
+
+    // Pass 2: bin the same values that were counted above.
+    let last_bin = HISTOGRAM_BINS - 1;
+    let mut counts = vec![0u64; HISTOGRAM_BINS];
+    for v in vals.iter().copied().filter(|v| v.is_finite()) {
+        let bin = ((v - min) / bin_width) as usize;
+        // Guard against floating-point rounding pushing the maximum one bin too far.
+        counts[bin.min(last_bin)] += 1;
+    }
+
+    // Interpolated percentiles can overshoot the data: the bins are padded past
+    // `max` (by a full unit for constant columns), so clamp to the observed range.
+    let p_low = percentile_from_histogram(&counts, min, bin_width, count, FEATURE_PERCENTILE_LOW)
+        .clamp(min, max);
+    let p_high = percentile_from_histogram(&counts, min, bin_width, count, FEATURE_PERCENTILE_HIGH)
+        .clamp(min, max);
+
+    FeatureStats {
+        p_low,
+        p_high,
+        histogram: Histogram {
+            min,
+            max,
+            bin_width,
+            counts,
+        },
+    }
+}
+
+/// Materialises a polars `Column` as a plain `Vec<f64>`, casting to Float64
+/// first and writing `f64::NAN` wherever the column holds a null.
+///
+/// Panics if the column cannot be cast to Float64.
+fn column_to_f64_vec(c: &Column) -> Vec<f64> {
+    let as_f64 = c.cast(&DataType::Float64).unwrap();
+    let chunked = as_f64.f64().unwrap();
+
+    let mut values = Vec::with_capacity(chunked.len());
+    for maybe in chunked.into_iter() {
+        values.push(match maybe {
+            Some(v) => v,
+            None => f64::NAN,
+        });
+    }
+    values
+}
+
+/// Computes the H3 cell id of every (lat, lon) pair at each resolution in
+/// `H3_PRECOMPUTE_MIN..=H3_PRECOMPUTE_MAX`, one resolution per rayon task.
+///
+/// The returned Vec has 16 slots indexed by resolution; slots outside the
+/// precomputed range are left empty. Coordinates rejected by
+/// `h3o::LatLng::new` are recorded as cell id 0. Rows are paired with `zip`,
+/// so only the first `min(lat.len(), lon.len())` rows are processed.
+///
+/// Panics if a resolution in the precompute range is not a valid H3 resolution.
+pub fn precompute_h3(lat: &[f64], lon: &[f64]) -> Vec<Vec<u64>> {
+    eprintln!(
+        "Precomputing H3 cells for resolutions {}..{}...",
+        H3_PRECOMPUTE_MIN, H3_PRECOMPUTE_MAX
+    );
+
+    // Cell ids of all rows at a single resolution.
+    let cells_at = |res: u8| -> (u8, Vec<u64>) {
+        let h3_res = h3o::Resolution::try_from(res).unwrap();
+        let mut cells: Vec<u64> = Vec::with_capacity(lat.len().min(lon.len()));
+        for (&la, &lo) in lat.iter().zip(lon.iter()) {
+            let cell = match h3o::LatLng::new(la, lo) {
+                Ok(coord) => u64::from(coord.to_cell(h3_res)),
+                Err(_) => 0,
+            };
+            cells.push(cell);
+        }
+        eprintln!("  Resolution {} done ({} cells)", res, cells.len());
+        (res, cells)
+    };
+
+    let wanted: Vec<u8> = (H3_PRECOMPUTE_MIN..=H3_PRECOMPUTE_MAX).collect();
+    let computed: Vec<(u8, Vec<u64>)> = wanted.into_par_iter().map(cells_at).collect();
+
+    // One slot per H3 resolution; only the computed ones get filled in.
+    let mut by_resolution: Vec<Vec<u64>> = vec![Vec::new(); 16];
+    for (res, cells) in computed {
+        by_resolution[res as usize] = cells;
+    }
+
+    eprintln!("H3 precomputation complete.");
+    by_resolution
+}
+
+impl PropertyData {
+    /// Loads the property dataset from `parquet_path` into memory.
+    ///
+    /// Processing steps:
+    /// 1. Read the schema and treat every numeric column except `lat`/`lon`
+    ///    as a feature.
+    /// 2. Read `lat`, `lon` and the feature columns, all cast to Float64.
+    ///    Null coordinates become 0.0; null feature values become NaN.
+    /// 3. Compute per-feature statistics in parallel on the column-major data.
+    /// 4. Sort rows by a 0.01° grid-cell key and store the features row-major
+    ///    in that order, so spatially close rows are adjacent in memory.
+    ///
+    /// # Panics
+    /// Panics if the file cannot be scanned or read, if the schema cannot be
+    /// resolved, or if the table has more than `u32::MAX` rows (the row
+    /// permutation uses `u32` indices).
+    pub fn load(parquet_path: &Path) -> Self {
+        eprintln!("Loading parquet from {:?}...", parquet_path);
+
+        // Scan schema to discover numeric feature columns
+        let mut lf = LazyFrame::scan_parquet(parquet_path, Default::default())
+            .expect("Failed to scan parquet");
+        let schema = lf.collect_schema().expect("Failed to read schema");
+
+        let feature_names: Vec<String> = schema
+            .iter()
+            .filter(|(name, dtype)| {
+                is_numeric_dtype(dtype) && !EXCLUDED_COLUMNS.contains(&name.as_str())
+            })
+            .map(|(name, _)| name.to_string())
+            .collect();
+
+        let num_features = feature_names.len();
+        eprintln!("Discovered {} numeric feature columns", num_features);
+
+        // Read only the columns we need, reusing the scan created for the schema.
+        let mut cols_needed: Vec<String> = vec!["lat".into(), "lon".into()];
+        cols_needed.extend(feature_names.iter().cloned());
+
+        let df = lf
+            .select(
+                cols_needed
+                    .iter()
+                    .map(|c| col(c.as_str()).cast(DataType::Float64))
+                    .collect::<Vec<_>>(),
+            )
+            .collect()
+            .expect("Failed to read parquet");
+
+        let row_count = df.height();
+        eprintln!("Loaded {} rows", row_count);
+
+        // Coordinates were already cast to Float64 by the select above.
+        // Null coordinates are stored as 0.0.
+        let coords = |name: &str| -> Vec<f64> {
+            df.column(name)
+                .expect("coordinate column missing after select")
+                .f64()
+                .expect("coordinate column should be Float64 after select")
+                .into_iter()
+                .map(|v| v.unwrap_or(0.0))
+                .collect()
+        };
+        let lat = coords("lat");
+        let lon = coords("lon");
+
+        // Extract feature columns (column-major, for cache-friendly histogram computation)
+        eprintln!("Extracting feature columns...");
+        let col_major: Vec<Vec<f64>> = feature_names
+            .iter()
+            .map(|name| {
+                let s = df.column(name.as_str()).unwrap();
+                column_to_f64_vec(s)
+            })
+            .collect();
+
+        // Compute histograms in parallel (column-major is ideal for per-column iteration)
+        eprintln!("Computing histograms...");
+        let feature_stats: Vec<FeatureStats> = col_major
+            .par_iter()
+            .zip(feature_names.par_iter())
+            .map(|(vals, name)| {
+                let stats = compute_feature_stats(vals);
+                eprintln!(
+                    "  {}: p{}={:.2}, p{}={:.2}, {} bins",
+                    name,
+                    FEATURE_PERCENTILE_LOW, stats.p_low,
+                    FEATURE_PERCENTILE_HIGH, stats.p_high,
+                    stats.histogram.counts.len()
+                );
+                stats
+            })
+            .collect();
+
+        // Sort all rows by spatial locality so that grid queries access
+        // contiguous memory (sequential reads instead of random DRAM accesses).
+        // The sort key is the row-major index of the row's 0.01° grid cell.
+        eprintln!("Sorting rows by spatial locality...");
+        let grid_cell_size = 0.01_f64;
+        let min_lat_val = lat.iter().cloned().fold(f64::INFINITY, f64::min) - grid_cell_size;
+        let min_lon_val = lon.iter().cloned().fold(f64::INFINITY, f64::min) - grid_cell_size;
+        let max_lon_val = lon.iter().cloned().fold(f64::NEG_INFINITY, f64::max) + grid_cell_size;
+        let grid_cols = ((max_lon_val - min_lon_val) / grid_cell_size).ceil() as u64 + 1;
+
+        // Compute each row's key once instead of on every comparison.
+        // Saturating arithmetic keeps out-of-range coordinates from overflowing.
+        let sort_keys: Vec<u64> = lat
+            .iter()
+            .zip(lon.iter())
+            .map(|(&la, &lo)| {
+                let r = ((la - min_lat_val) / grid_cell_size) as u64;
+                let c = ((lo - min_lon_val) / grid_cell_size) as u64;
+                r.saturating_mul(grid_cols).saturating_add(c)
+            })
+            .collect();
+
+        // The permutation stores u32 indices to halve its memory footprint;
+        // refuse tables that do not fit rather than silently truncating them.
+        let perm_len = u32::try_from(row_count)
+            .expect("row count exceeds u32::MAX; cannot build spatial permutation");
+        let mut perm: Vec<u32> = (0..perm_len).collect();
+        perm.sort_unstable_by_key(|&i| sort_keys[i as usize]);
+
+        // Apply permutation to lat/lon
+        let lat: Vec<f64> = perm.iter().map(|&i| lat[i as usize]).collect();
+        let lon: Vec<f64> = perm.iter().map(|&i| lon[i as usize]).collect();
+
+        // Transpose to row-major AND apply spatial permutation in one pass.
+        // Result: all features for one row are contiguous, and spatially
+        // nearby rows are adjacent in memory.
+        eprintln!("Transposing to row-major layout (spatially sorted)...");
+        let mut feature_data = vec![f64::NAN; row_count * num_features];
+        for (new_row, &old_row) in perm.iter().enumerate() {
+            let old = old_row as usize;
+            let dst_base = new_row * num_features;
+            for (feat_idx, col_vec) in col_major.iter().enumerate() {
+                feature_data[dst_base + feat_idx] = col_vec[old];
+            }
+        }
+
+        eprintln!("Data loading complete.");
+
+        PropertyData {
+            lat,
+            lon,
+            feature_names,
+            num_features,
+            feature_data,
+            feature_stats,
+        }
+    }
+}
+
+/// A single point of interest in row form; serialized via serde with the field names exactly as declared (e.g. `lng`).
+#[derive(Serialize)]
+pub struct POI {
+    pub id: String,
+    pub name: String,
+    pub category: String,
+    pub lat: f64,
+    pub lng: f64,
+    pub emoji: String,
+}
+
+/// Columnar storage for POI data: index `i` across all six vectors describes one POI row of the source parquet.
+pub struct POIData {
+    pub id: Vec<String>,
+    pub name: Vec<String>,
+    pub category: Vec<String>,
+    pub lat: Vec<f64>,
+    pub lng: Vec<f64>,
+    pub emoji: Vec<String>,
+}
+
+impl POIData {
+    /// Loads all POIs from `parquet_path` into columnar vectors.
+    ///
+    /// Expects string columns `id`, `name`, `category`, `emoji` and numeric
+    /// columns `lat`, `lng`. Null strings become `""` and null coordinates
+    /// become 0.0. Coordinates are cast to Float64, so files that store them
+    /// as Float32 or integers load as well.
+    ///
+    /// # Panics
+    /// Panics if the file cannot be scanned or read, or if a required column
+    /// is missing or has an incompatible type; the message names the column.
+    pub fn load(parquet_path: &Path) -> Self {
+        eprintln!("Loading POI data from {:?}...", parquet_path);
+
+        let df = LazyFrame::scan_parquet(parquet_path, Default::default())
+            .expect("Failed to scan POI parquet")
+            .collect()
+            .expect("Failed to read POI parquet");
+
+        let row_count = df.height();
+        eprintln!("Loaded {} POIs", row_count);
+
+        let data = POIData {
+            id: Self::string_column(&df, "id"),
+            name: Self::string_column(&df, "name"),
+            category: Self::string_column(&df, "category"),
+            lat: Self::f64_column(&df, "lat"),
+            lng: Self::f64_column(&df, "lng"),
+            emoji: Self::string_column(&df, "emoji"),
+        };
+
+        eprintln!("POI data loading complete.");
+        data
+    }
+
+    /// Extracts string column `name` as owned `String`s, mapping nulls to `""`.
+    fn string_column(df: &DataFrame, name: &str) -> Vec<String> {
+        df.column(name)
+            .unwrap_or_else(|e| panic!("POI parquet is missing column {:?}: {}", name, e))
+            .str()
+            .unwrap_or_else(|e| panic!("POI column {:?} is not a string column: {}", name, e))
+            .into_iter()
+            .map(|v| v.unwrap_or("").to_string())
+            .collect()
+    }
+
+    /// Extracts numeric column `name` as `f64`, casting if needed and mapping nulls to 0.0.
+    fn f64_column(df: &DataFrame, name: &str) -> Vec<f64> {
+        let as_f64 = df
+            .column(name)
+            .unwrap_or_else(|e| panic!("POI parquet is missing column {:?}: {}", name, e))
+            .cast(&DataType::Float64)
+            .unwrap_or_else(|e| panic!("POI column {:?} cannot be cast to f64: {}", name, e));
+        as_f64
+            .f64()
+            .unwrap_or_else(|e| panic!("POI column {:?} is not Float64 after cast: {}", name, e))
+            .into_iter()
+            .map(|v| v.unwrap_or(0.0))
+            .collect()
+    }
+}