Fix data loading

This commit is contained in:
Andras Schmelczer 2026-02-01 13:09:58 +00:00
parent 7235df0a97
commit c84af213e2
2 changed files with 65 additions and 15 deletions

View file

@@ -190,7 +190,8 @@ def sample_noise_at_postcodes(
easting: np.ndarray,
northing: np.ndarray,
label: str,
) -> np.ndarray:
col_name: str,
) -> pl.Series:
"""Sample noise values from merged tiles at given BNG coordinates."""
print(f"[{label}] Merging {len(tile_paths)} tiles...")
datasets = [rasterio.open(p) for p in tile_paths]
@@ -223,7 +224,10 @@ def sample_noise_at_postcodes(
valid_count = int(np.sum(~np.isnan(noise_db)))
print(f"[{label}] Sampled {valid_count:,} / {len(easting):,} postcodes with noise data")
return noise_db
# Return as masked Series: use null (not NaN) so that Polars max_horizontal
# correctly ignores missing values instead of propagating NaN.
return pl.Series(col_name, noise_db).fill_nan(None)
def main() -> None:
@@ -264,11 +268,11 @@ def main() -> None:
if not tile_paths:
print(f"[{label}] WARNING: No tiles downloaded — column will be all null")
noise_db = np.full(len(lat), np.nan, dtype=np.float32)
series = pl.Series(col_name, [None] * len(lat), dtype=pl.Float32)
else:
noise_db = sample_noise_at_postcodes(tile_paths, easting, northing, label)
series = sample_noise_at_postcodes(tile_paths, easting, northing, label, col_name)
result = result.with_columns(pl.Series(col_name, noise_db))
result = result.with_columns(series)
result.write_parquet(args.output, compression="zstd")
size_mb = args.output.stat().st_size / (1024 * 1024)