from pathlib import Path

import h3
import polars as pl

from pipeline.config import DATA_DIR, H3_RESOLUTIONS, PROCESSED_DIR


def lat_long_to_h3(lat: float, long: float, resolution: int) -> str:
    """Convert a lat/long pair to its H3 cell index at the given resolution.

    Args:
        lat: Latitude passed through to ``h3.latlng_to_cell``.
        long: Longitude passed through to ``h3.latlng_to_cell``.
        resolution: H3 resolution passed through to ``h3.latlng_to_cell``.

    Returns:
        The value returned by ``h3.latlng_to_cell``.
    """
    return h3.latlng_to_cell(lat, long, resolution)


def load_postcodes() -> pl.LazyFrame:
    """Lazily scan the arcgis parquet file and keep the postcode columns.

    Reads ``DATA_DIR / "arcgis_data.parquet"`` and selects ``pcds`` (renamed
    to ``postcode``), ``lat`` and ``long``.

    Returns:
        A LazyFrame with the columns ``postcode``, ``lat`` and ``long``.
    """
    source = DATA_DIR / "arcgis_data.parquet"
    return pl.scan_parquet(source).select(
        pl.col("pcds").alias("postcode"),
        pl.col("lat"),
        pl.col("long"),
    )


def _row_to_h3(row: dict[str, float | None], resolution: int) -> str | None:
    """Return the H3 index for one ``{"lat", "long"}`` row, or None if incomplete."""
    lat = row["lat"]
    long = row["long"]
    # A row without both coordinates cannot be indexed; emit a null rather
    # than letting the h3 call fail on None and abort the whole run.
    if lat is None or long is None:
        return None
    return lat_long_to_h3(lat, long, resolution)


def process_postcodes() -> pl.LazyFrame:
    """Add one H3 index column per configured resolution to the postcodes.

    For every ``res`` in ``H3_RESOLUTIONS`` a string column named
    ``h3_res{res}`` is added, computed row by row from ``lat`` and ``long``.
    Rows where either coordinate is null get a null index.

    Returns:
        A LazyFrame wrapping the already-computed DataFrame; the frame is
        materialised inside this function before the columns are added.
    """
    frame = load_postcodes().collect()
    coords = pl.struct(["lat", "long"])

    for res in H3_RESOLUTIONS:
        frame = frame.with_columns(
            coords.map_elements(
                # Bind res as a default argument so each lambda keeps its own
                # resolution instead of the loop variable's final value.
                lambda row, res=res: _row_to_h3(row, res),
                return_dtype=pl.Utf8,
            ).alias(f"h3_res{res}")
        )

    return frame.lazy()


def save_postcodes(output_path: Path | None = None) -> Path:
    """Process the postcodes and write them to a parquet file.

    Args:
        output_path: Destination file. When None, defaults to
            ``PROCESSED_DIR / "postcodes_h3.parquet"``.

    Returns:
        The path the parquet file was written to.
    """
    if output_path is None:
        output_path = PROCESSED_DIR / "postcodes_h3.parquet"

    # Create the destination directory (and any missing parents) up front.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    process_postcodes().collect().write_parquet(output_path)
    return output_path