48 lines
1.4 KiB
Python
48 lines
1.4 KiB
Python
from pathlib import Path
|
|
import polars as pl
|
|
import h3
|
|
|
|
from pipeline.config import DATA_DIR, H3_RESOLUTIONS, PROCESSED_DIR
|
|
|
|
|
|
def lat_long_to_h3(lat: float, long: float, resolution: int) -> str:
    """Return the H3 cell index for a latitude/longitude pair.

    Thin wrapper around ``h3.latlng_to_cell``; the arguments are forwarded
    positionally in the order (lat, long, resolution).

    Args:
        lat: Latitude of the point.
        long: Longitude of the point.
        resolution: H3 resolution at which to index the point.

    Returns:
        The value produced by ``h3.latlng_to_cell`` for the point.
    """
    cell_index = h3.latlng_to_cell(lat, long, resolution)
    return cell_index
|
|
|
|
|
|
def load_postcodes() -> pl.LazyFrame:
    """Lazily scan the ArcGIS postcode parquet file.

    Reads ``arcgis_data.parquet`` from ``DATA_DIR`` and keeps three columns:
    ``pcds`` (renamed to ``postcode``), ``lat`` and ``long``.

    Returns:
        A lazy frame with columns ``postcode``, ``lat`` and ``long``.
    """
    source_file = DATA_DIR / "arcgis_data.parquet"
    wanted_columns = [
        pl.col("pcds").alias("postcode"),
        pl.col("lat"),
        pl.col("long"),
    ]
    return pl.scan_parquet(source_file).select(wanted_columns)
|
|
|
|
|
|
def process_postcodes() -> pl.LazyFrame:
    """Load postcodes and add one H3 index column per configured resolution.

    For every entry ``res`` in ``H3_RESOLUTIONS`` a string column named
    ``h3_res{res}`` is added, computed row by row from the ``lat`` and
    ``long`` columns via ``lat_long_to_h3``.

    The postcode frame is materialised before the H3 columns are computed
    and the result is re-wrapped as a lazy frame, so the return type matches
    ``load_postcodes``.

    Returns:
        A lazy frame over the already-computed data containing
        ``postcode``, ``lat``, ``long`` and the ``h3_res*`` columns.
    """

    def h3_column(resolution: int) -> pl.Expr:
        # The resolution is bound through this function's parameter, so each
        # expression keeps its own value instead of sharing a loop variable
        # that a lambda would only look up when it is finally called.
        return (
            pl.struct(["lat", "long"])
            .map_elements(
                lambda row: lat_long_to_h3(row["lat"], row["long"], resolution),
                return_dtype=pl.Utf8,
            )
            .alias(f"h3_res{resolution}")
        )

    df = load_postcodes().collect()

    # dict.fromkeys drops repeated resolutions while keeping first-seen order;
    # a repeated resolution previously just overwrote its own column, whereas
    # a single with_columns call rejects duplicate output names.
    h3_columns = [h3_column(res) for res in dict.fromkeys(H3_RESOLUTIONS)]

    # One with_columns call adds every H3 column in a single pass.
    return df.with_columns(h3_columns).lazy()
|
|
|
|
|
|
def save_postcodes(output_path: Path | None = None) -> Path:
    """Compute the H3-enriched postcode table and write it as parquet.

    Args:
        output_path: Destination file. When omitted (or falsy) the file
            ``postcodes_h3.parquet`` inside ``PROCESSED_DIR`` is used.

    Returns:
        The path the parquet file was written to.
    """
    if output_path:
        destination = output_path
    else:
        destination = PROCESSED_DIR / "postcodes_h3.parquet"

    # Create the target directory first so the write cannot fail on a
    # missing parent.
    destination.parent.mkdir(parents=True, exist_ok=True)

    process_postcodes().collect().write_parquet(destination)
    return destination
|