from pathlib import Path

import polars as pl

from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS


def aggregate(df: pl.LazyFrame, resolution: int) -> pl.LazyFrame:
    """Aggregate property data by H3 cell and year.

    Groups ``df`` on the ``h3_res{resolution}`` column together with
    ``year`` and summarises the ``price`` column of each group.

    Args:
        df: Lazy frame containing at least ``h3_res{resolution}``, ``year``
            and ``price`` columns.
        resolution: H3 resolution used to pick the cell column to group on.

    Returns:
        A lazy frame with columns ``h3``, ``year``, ``count``, ``avg_price``,
        ``median_price``, ``min_price`` and ``max_price``. Nothing is
        computed until the caller collects it, and no sort is applied here.
    """
    h3_col = f"h3_res{resolution}"
    return (
        df.group_by(h3_col, "year")
        .agg(
            pl.len().alias("count"),
            pl.col("price").mean().alias("avg_price"),
            pl.col("price").median().alias("median_price"),
            pl.col("price").min().alias("min_price"),
            pl.col("price").max().alias("max_price"),
        )
        # Expose a resolution-independent column name so every output
        # shares the same schema.
        .rename({h3_col: "h3"})
    )


def aggregate_all(df: pl.LazyFrame) -> dict[int, pl.LazyFrame]:
    """Aggregate at all H3 resolutions.

    Args:
        df: Lazy frame accepted by :func:`aggregate` for every resolution in
            ``H3_RESOLUTIONS``.

    Returns:
        Mapping from each resolution in ``H3_RESOLUTIONS`` to its (still
        lazy) aggregate.
    """
    return {res: aggregate(df, res) for res in H3_RESOLUTIONS}


def save_aggregates(df: pl.LazyFrame, output_dir: Path | None = None) -> list[Path]:
    """Aggregate and save at all H3 resolutions.

    One Parquet file named ``res{resolution}.parquet`` is written per
    resolution. The output directory is created (including parents) if it
    does not exist.

    Args:
        df: Lazy frame to aggregate.
        output_dir: Directory to write into; defaults to ``AGGREGATES_DIR``
            when ``None``.

    Returns:
        Paths of the written files, in the iteration order of
        ``H3_RESOLUTIONS``.
    """
    if output_dir is None:
        output_dir = AGGREGATES_DIR
    output_dir.mkdir(parents=True, exist_ok=True)

    saved_paths: list[Path] = []
    for res, agg_df in aggregate_all(df).items():
        output_path = output_dir / f"res{res}.parquet"
        # Each resolution is materialised and written on its own, so only
        # one aggregate result is held at a time.
        agg_df.collect().write_parquet(output_path)
        saved_paths.append(output_path)
    return saved_paths