Fix OOMs
This commit is contained in:
parent
b580c51b6d
commit
7ca29c2d81
2 changed files with 58 additions and 62 deletions
|
|
@ -133,12 +133,49 @@ pub struct PostcodeData {
|
|||
pub centroids: Vec<(f32, f32)>,
|
||||
/// Precomputed AABB per postcode: (south, west, north, east) as f32
|
||||
pub aabbs: Vec<(f32, f32, f32, f32)>,
|
||||
/// Precomputed GeoJSON geometry Value per postcode
|
||||
pub geometries: Vec<serde_json::Value>,
|
||||
/// Compact polygon storage: outer Vec is per-postcode, inner Vecs are rings of [lon, lat] f32 pairs.
|
||||
/// Held as raw f32 to keep boundary memory ~10x smaller than serde_json::Value form.
|
||||
pub polygons: Vec<Vec<Vec<[f32; 2]>>>,
|
||||
/// Lookup from postcode string to index
|
||||
pub postcode_to_idx: FxHashMap<String, usize>,
|
||||
}
|
||||
|
||||
impl PostcodeData {
|
||||
/// Build the GeoJSON Value for a postcode polygon on demand.
|
||||
pub fn geometry_geojson(&self, idx: usize) -> serde_json::Value {
|
||||
let rings = &self.polygons[idx];
|
||||
if rings.len() == 1 {
|
||||
let coords: Vec<serde_json::Value> = rings[0]
|
||||
.iter()
|
||||
.map(|[lon, lat]| {
|
||||
serde_json::Value::Array(vec![
|
||||
serde_json::Value::from(*lon as f64),
|
||||
serde_json::Value::from(*lat as f64),
|
||||
])
|
||||
})
|
||||
.collect();
|
||||
serde_json::json!({"type": "Polygon", "coordinates": [coords]})
|
||||
} else {
|
||||
let polys: Vec<serde_json::Value> = rings
|
||||
.iter()
|
||||
.map(|ring| {
|
||||
let coords: Vec<serde_json::Value> = ring
|
||||
.iter()
|
||||
.map(|[lon, lat]| {
|
||||
serde_json::Value::Array(vec![
|
||||
serde_json::Value::from(*lon as f64),
|
||||
serde_json::Value::from(*lat as f64),
|
||||
])
|
||||
})
|
||||
.collect();
|
||||
serde_json::Value::Array(vec![serde_json::Value::Array(coords)])
|
||||
})
|
||||
.collect();
|
||||
serde_json::json!({"type": "MultiPolygon", "coordinates": polys})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PostcodeData {
|
||||
/// Load postcode boundaries from a directory of GeoJSON files.
|
||||
/// Expects the directory to have a `units/` subdirectory containing .geojson files.
|
||||
|
|
@ -295,49 +332,13 @@ impl PostcodeData {
|
|||
postcode_to_idx.insert(postcode.clone(), idx);
|
||||
}
|
||||
|
||||
// Precompute GeoJSON geometry for each postcode
|
||||
let geometries: Vec<serde_json::Value> = polygons
|
||||
.iter()
|
||||
.map(|rings| {
|
||||
if rings.len() == 1 {
|
||||
let coords: Vec<serde_json::Value> = rings[0]
|
||||
.iter()
|
||||
.map(|[lon, lat]| {
|
||||
serde_json::Value::Array(vec![
|
||||
serde_json::Value::from(*lon as f64),
|
||||
serde_json::Value::from(*lat as f64),
|
||||
])
|
||||
})
|
||||
.collect();
|
||||
serde_json::json!({"type": "Polygon", "coordinates": [coords]})
|
||||
} else {
|
||||
let polys: Vec<serde_json::Value> = rings
|
||||
.iter()
|
||||
.map(|ring| {
|
||||
let coords: Vec<serde_json::Value> = ring
|
||||
.iter()
|
||||
.map(|[lon, lat]| {
|
||||
serde_json::Value::Array(vec![
|
||||
serde_json::Value::from(*lon as f64),
|
||||
serde_json::Value::from(*lat as f64),
|
||||
])
|
||||
})
|
||||
.collect();
|
||||
serde_json::Value::Array(vec![serde_json::Value::Array(coords)])
|
||||
})
|
||||
.collect();
|
||||
serde_json::json!({"type": "MultiPolygon", "coordinates": polys})
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
info!(postcodes = postcodes.len(), "Postcode boundary data ready");
|
||||
|
||||
Ok(PostcodeData {
|
||||
postcodes,
|
||||
centroids,
|
||||
aabbs,
|
||||
geometries,
|
||||
polygons,
|
||||
postcode_to_idx,
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1282,23 +1282,20 @@ impl PropertyData {
|
|||
};
|
||||
let mut poi_metrics = PostcodePoiMetrics::from_postcode_df(&postcode_df, poi_metric_names)?;
|
||||
|
||||
// Load properties.parquet and join with postcode data for lat/lon + area features
|
||||
// Load properties.parquet and join with postcode data lazily so the
|
||||
// wide combined frame is never fully materialized — projection is
|
||||
// pushed down into the join, keeping peak memory bounded.
|
||||
tracing::info!("Loading properties from {:?}", properties_path);
|
||||
let properties_path = PlRefPath::try_from_path(properties_path)
|
||||
.context("Failed to normalize properties parquet path")?;
|
||||
let properties_lf = LazyFrame::scan_parquet(properties_path, Default::default())
|
||||
.context("Failed to scan properties parquet")?;
|
||||
let combined = properties_lf
|
||||
.join(
|
||||
postcode_df.clone().lazy(),
|
||||
[col("Postcode")],
|
||||
[col("Postcode")],
|
||||
JoinArgs::new(JoinType::Left),
|
||||
)
|
||||
.collect()
|
||||
.context("Failed to join properties with postcodes")?;
|
||||
let total_rows = combined.height();
|
||||
tracing::info!(rows = total_rows, "Properties joined with postcodes");
|
||||
let combined_lf = properties_lf.join(
|
||||
postcode_df.lazy(),
|
||||
[col("Postcode")],
|
||||
[col("Postcode")],
|
||||
JoinArgs::new(JoinType::Left),
|
||||
);
|
||||
|
||||
// Get configured feature/enum names in config order. Dynamic POI
|
||||
// metrics live in a postcode-level side table so they do not widen the
|
||||
|
|
@ -1306,7 +1303,10 @@ impl PropertyData {
|
|||
let configured_numeric_names = features::all_numeric_feature_names();
|
||||
let enum_names = features::all_enum_feature_names();
|
||||
|
||||
let schema = combined.schema();
|
||||
let schema = combined_lf
|
||||
.clone()
|
||||
.collect_schema()
|
||||
.context("Failed to collect joined schema")?;
|
||||
let numeric_names: Vec<String> = configured_numeric_names
|
||||
.iter()
|
||||
.map(|name| (*name).to_string())
|
||||
|
|
@ -1402,24 +1402,16 @@ impl PropertyData {
|
|||
if has_renovation_history {
|
||||
select_exprs.push(col("renovation_history"));
|
||||
}
|
||||
let df = combined
|
||||
.lazy()
|
||||
let df = combined_lf
|
||||
.filter(col("lat").is_not_null().and(col("lon").is_not_null()))
|
||||
.select(select_exprs)
|
||||
.collect()
|
||||
.context("Failed to select columns from combined data")?;
|
||||
.context("Failed to select columns from joined frame")?;
|
||||
|
||||
let row_count = df.height();
|
||||
if row_count == 0 {
|
||||
bail!("No property rows have usable coordinates after joining postcode data");
|
||||
}
|
||||
let dropped_coordinate_rows = total_rows.saturating_sub(row_count);
|
||||
if dropped_coordinate_rows > 0 {
|
||||
tracing::warn!(
|
||||
rows = dropped_coordinate_rows,
|
||||
"Dropped properties with missing postcode coordinates"
|
||||
);
|
||||
}
|
||||
tracing::info!(rows = row_count, "Combined data selected");
|
||||
|
||||
let lat_series = df
|
||||
|
|
@ -1692,6 +1684,9 @@ impl PropertyData {
|
|||
FxHashMap::default()
|
||||
};
|
||||
|
||||
// Free the projected joined frame before building the row-major matrix.
|
||||
drop(df);
|
||||
|
||||
// Sort all rows by spatial locality so that grid queries access
|
||||
// contiguous memory (sequential reads instead of random DRAM accesses).
|
||||
tracing::info!("Sorting rows by spatial locality");
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue