This commit is contained in:
Andras Schmelczer 2026-05-12 06:44:54 +01:00
parent b580c51b6d
commit 7ca29c2d81
2 changed files with 58 additions and 62 deletions

View file

@ -133,12 +133,49 @@ pub struct PostcodeData {
pub centroids: Vec<(f32, f32)>,
/// Precomputed AABB per postcode: (south, west, north, east) as f32
pub aabbs: Vec<(f32, f32, f32, f32)>,
/// Precomputed GeoJSON geometry Value per postcode
pub geometries: Vec<serde_json::Value>,
/// Compact polygon storage: outer Vec is per-postcode, inner Vecs are rings of [lon, lat] f32 pairs.
/// Held as raw f32 to keep boundary memory ~10x smaller than serde_json::Value form.
pub polygons: Vec<Vec<Vec<[f32; 2]>>>,
/// Lookup from postcode string to index
pub postcode_to_idx: FxHashMap<String, usize>,
}
impl PostcodeData {
/// Build the GeoJSON Value for a postcode polygon on demand.
pub fn geometry_geojson(&self, idx: usize) -> serde_json::Value {
let rings = &self.polygons[idx];
if rings.len() == 1 {
let coords: Vec<serde_json::Value> = rings[0]
.iter()
.map(|[lon, lat]| {
serde_json::Value::Array(vec![
serde_json::Value::from(*lon as f64),
serde_json::Value::from(*lat as f64),
])
})
.collect();
serde_json::json!({"type": "Polygon", "coordinates": [coords]})
} else {
let polys: Vec<serde_json::Value> = rings
.iter()
.map(|ring| {
let coords: Vec<serde_json::Value> = ring
.iter()
.map(|[lon, lat]| {
serde_json::Value::Array(vec![
serde_json::Value::from(*lon as f64),
serde_json::Value::from(*lat as f64),
])
})
.collect();
serde_json::Value::Array(vec![serde_json::Value::Array(coords)])
})
.collect();
serde_json::json!({"type": "MultiPolygon", "coordinates": polys})
}
}
}
impl PostcodeData {
/// Load postcode boundaries from a directory of GeoJSON files.
/// Expects the directory to have a `units/` subdirectory containing .geojson files.
@ -295,49 +332,13 @@ impl PostcodeData {
postcode_to_idx.insert(postcode.clone(), idx);
}
// Precompute GeoJSON geometry for each postcode
let geometries: Vec<serde_json::Value> = polygons
.iter()
.map(|rings| {
if rings.len() == 1 {
let coords: Vec<serde_json::Value> = rings[0]
.iter()
.map(|[lon, lat]| {
serde_json::Value::Array(vec![
serde_json::Value::from(*lon as f64),
serde_json::Value::from(*lat as f64),
])
})
.collect();
serde_json::json!({"type": "Polygon", "coordinates": [coords]})
} else {
let polys: Vec<serde_json::Value> = rings
.iter()
.map(|ring| {
let coords: Vec<serde_json::Value> = ring
.iter()
.map(|[lon, lat]| {
serde_json::Value::Array(vec![
serde_json::Value::from(*lon as f64),
serde_json::Value::from(*lat as f64),
])
})
.collect();
serde_json::Value::Array(vec![serde_json::Value::Array(coords)])
})
.collect();
serde_json::json!({"type": "MultiPolygon", "coordinates": polys})
}
})
.collect();
info!(postcodes = postcodes.len(), "Postcode boundary data ready");
Ok(PostcodeData {
postcodes,
centroids,
aabbs,
geometries,
polygons,
postcode_to_idx,
})
}

View file

@ -1282,23 +1282,20 @@ impl PropertyData {
};
let mut poi_metrics = PostcodePoiMetrics::from_postcode_df(&postcode_df, poi_metric_names)?;
// Load properties.parquet and join with postcode data for lat/lon + area features
// Load properties.parquet and join with postcode data lazily so the
// wide combined frame is never fully materialized — projection is
// pushed down into the join, keeping peak memory bounded.
tracing::info!("Loading properties from {:?}", properties_path);
let properties_path = PlRefPath::try_from_path(properties_path)
.context("Failed to normalize properties parquet path")?;
let properties_lf = LazyFrame::scan_parquet(properties_path, Default::default())
.context("Failed to scan properties parquet")?;
let combined = properties_lf
.join(
postcode_df.clone().lazy(),
[col("Postcode")],
[col("Postcode")],
JoinArgs::new(JoinType::Left),
)
.collect()
.context("Failed to join properties with postcodes")?;
let total_rows = combined.height();
tracing::info!(rows = total_rows, "Properties joined with postcodes");
let combined_lf = properties_lf.join(
postcode_df.lazy(),
[col("Postcode")],
[col("Postcode")],
JoinArgs::new(JoinType::Left),
);
// Get configured feature/enum names in config order. Dynamic POI
// metrics live in a postcode-level side table so they do not widen the
@ -1306,7 +1303,10 @@ impl PropertyData {
let configured_numeric_names = features::all_numeric_feature_names();
let enum_names = features::all_enum_feature_names();
let schema = combined.schema();
let schema = combined_lf
.clone()
.collect_schema()
.context("Failed to collect joined schema")?;
let numeric_names: Vec<String> = configured_numeric_names
.iter()
.map(|name| (*name).to_string())
@ -1402,24 +1402,16 @@ impl PropertyData {
if has_renovation_history {
select_exprs.push(col("renovation_history"));
}
let df = combined
.lazy()
let df = combined_lf
.filter(col("lat").is_not_null().and(col("lon").is_not_null()))
.select(select_exprs)
.collect()
.context("Failed to select columns from combined data")?;
.context("Failed to select columns from joined frame")?;
let row_count = df.height();
if row_count == 0 {
bail!("No property rows have usable coordinates after joining postcode data");
}
let dropped_coordinate_rows = total_rows.saturating_sub(row_count);
if dropped_coordinate_rows > 0 {
tracing::warn!(
rows = dropped_coordinate_rows,
"Dropped properties with missing postcode coordinates"
);
}
tracing::info!(rows = row_count, "Combined data selected");
let lat_series = df
@ -1692,6 +1684,9 @@ impl PropertyData {
FxHashMap::default()
};
// Free the projected joined frame before building the row-major matrix.
drop(df);
// Sort all rows by spatial locality so that grid queries access
// contiguous memory (sequential reads instead of random DRAM accesses).
tracing::info!("Sorting rows by spatial locality");