This commit is contained in:
Andras Schmelczer 2026-05-12 06:44:54 +01:00
parent b580c51b6d
commit 7ca29c2d81
2 changed files with 58 additions and 62 deletions

View file

@@ -1282,23 +1282,20 @@ impl PropertyData {
};
let mut poi_metrics = PostcodePoiMetrics::from_postcode_df(&postcode_df, poi_metric_names)?;
// Load properties.parquet and join with postcode data for lat/lon + area features
// Load properties.parquet and join with postcode data lazily so the
// wide combined frame is never fully materialized — projection is
// pushed down into the join, keeping peak memory bounded.
tracing::info!("Loading properties from {:?}", properties_path);
let properties_path = PlRefPath::try_from_path(properties_path)
.context("Failed to normalize properties parquet path")?;
let properties_lf = LazyFrame::scan_parquet(properties_path, Default::default())
.context("Failed to scan properties parquet")?;
let combined = properties_lf
.join(
postcode_df.clone().lazy(),
[col("Postcode")],
[col("Postcode")],
JoinArgs::new(JoinType::Left),
)
.collect()
.context("Failed to join properties with postcodes")?;
let total_rows = combined.height();
tracing::info!(rows = total_rows, "Properties joined with postcodes");
let combined_lf = properties_lf.join(
postcode_df.lazy(),
[col("Postcode")],
[col("Postcode")],
JoinArgs::new(JoinType::Left),
);
// Get configured feature/enum names in config order. Dynamic POI
// metrics live in a postcode-level side table so they do not widen the
@@ -1306,7 +1303,10 @@ impl PropertyData {
let configured_numeric_names = features::all_numeric_feature_names();
let enum_names = features::all_enum_feature_names();
let schema = combined.schema();
let schema = combined_lf
.clone()
.collect_schema()
.context("Failed to collect joined schema")?;
let numeric_names: Vec<String> = configured_numeric_names
.iter()
.map(|name| (*name).to_string())
@@ -1402,24 +1402,16 @@ impl PropertyData {
if has_renovation_history {
select_exprs.push(col("renovation_history"));
}
let df = combined
.lazy()
let df = combined_lf
.filter(col("lat").is_not_null().and(col("lon").is_not_null()))
.select(select_exprs)
.collect()
.context("Failed to select columns from combined data")?;
.context("Failed to select columns from joined frame")?;
let row_count = df.height();
if row_count == 0 {
bail!("No property rows have usable coordinates after joining postcode data");
}
let dropped_coordinate_rows = total_rows.saturating_sub(row_count);
if dropped_coordinate_rows > 0 {
tracing::warn!(
rows = dropped_coordinate_rows,
"Dropped properties with missing postcode coordinates"
);
}
tracing::info!(rows = row_count, "Combined data selected");
let lat_series = df
@@ -1692,6 +1684,9 @@ impl PropertyData {
FxHashMap::default()
};
// Free the projected joined frame before building the row-major matrix.
drop(df);
// Sort all rows by spatial locality so that grid queries access
// contiguous memory (sequential reads instead of random DRAM accesses).
tracing::info!("Sorting rows by spatial locality");