# Source: perfect-postcode/pipeline/sources/postcodes.py (48 lines, 1.4 KiB, Python)

from pathlib import Path
import polars as pl
import h3
from pipeline.config import DATA_DIR, H3_RESOLUTIONS, PROCESSED_DIR
def lat_long_to_h3(lat: float, long: float, resolution: int) -> str:
    """Return the H3 cell index for a latitude/longitude pair.

    Thin wrapper that forwards its arguments, in order, to
    ``h3.latlng_to_cell``.

    Args:
        lat: Latitude of the point.
        long: Longitude of the point.
        resolution: H3 resolution at which to index the point.

    Returns:
        The H3 cell index produced by ``h3.latlng_to_cell``.
    """
    cell = h3.latlng_to_cell(lat, long, resolution)
    return cell
def load_postcodes() -> pl.LazyFrame:
    """Lazily scan the ArcGIS parquet file and keep the postcode columns.

    Returns:
        A lazy frame over ``DATA_DIR / "arcgis_data.parquet"`` with three
        columns: ``postcode`` (the source ``pcds`` column renamed), ``lat``
        and ``long``.
    """
    source = DATA_DIR / "arcgis_data.parquet"
    wanted = [
        pl.col("pcds").alias("postcode"),
        pl.col("lat"),
        pl.col("long"),
    ]
    return pl.scan_parquet(source).select(wanted)
def process_postcodes() -> pl.LazyFrame:
    """Attach one H3 index column per configured resolution to the postcodes.

    The postcode frame is collected eagerly, a string column named
    ``h3_res<resolution>`` is appended for every entry in ``H3_RESOLUTIONS``
    (one ``with_columns`` call per resolution, in iteration order), and the
    result is handed back as a lazy frame.

    Returns:
        A lazy frame wrapping the already-computed data: the columns from
        ``load_postcodes`` plus the ``h3_res*`` columns.
    """

    def _h3_expr(resolution: int) -> pl.Expr:
        # Row-wise Python callback: each struct element arrives as a mapping
        # with the "lat" and "long" fields of that row.
        def _to_cell(row):
            return lat_long_to_h3(row["lat"], row["long"], resolution)

        coords = pl.struct(["lat", "long"])
        indexed = coords.map_elements(_to_cell, return_dtype=pl.Utf8)
        return indexed.alias(f"h3_res{resolution}")

    postcodes = load_postcodes().collect()
    for resolution in H3_RESOLUTIONS:
        postcodes = postcodes.with_columns(_h3_expr(resolution))
    return postcodes.lazy()
def save_postcodes(output_path: Path | None = None) -> Path:
    """Compute the H3-indexed postcodes and write them to a parquet file.

    Args:
        output_path: Destination file. When omitted (or otherwise falsy),
            ``PROCESSED_DIR / "postcodes_h3.parquet"`` is used.

    Returns:
        The path the parquet file was written to.
    """
    if output_path:
        target = output_path
    else:
        target = PROCESSED_DIR / "postcodes_h3.parquet"

    # Ensure the destination directory exists before writing.
    target.parent.mkdir(parents=True, exist_ok=True)

    process_postcodes().collect().write_parquet(target)
    return target