"""Pipeline CLI to process property data with H3 spatial indexing."""

from pathlib import Path

import polars as pl
from tqdm import tqdm

from pipeline.sources.postcodes import save_postcodes, DATA_DIR
from pipeline.sources.property_prices import PropertyPricesSource
from pipeline.processors.h3_aggregator import save_aggregates


def run_pipeline():
    """Execute the three pipeline stages in order, reporting progress on stdout.

    Stages:
        1. ``save_postcodes()`` is called and the path it returns is printed.
        2. That path is opened with ``pl.scan_parquet`` and the result is
           handed to ``PropertyPricesSource().process``.
        3. The processed properties are passed to ``save_aggregates``; every
           path it returns is listed by file name with its on-disk size in MB.

    Returns nothing; all results are written by the called helpers.
    """
    banner = "=" * 60
    print(banner)
    print("Property Map Data Pipeline")
    print(banner)

    # Stage 1: write the postcode data and remember where it landed.
    print("\n[1/3] Processing postcodes with H3 indices...")
    postcodes_path = save_postcodes()
    print(f" Saved: {postcodes_path}")

    # Stage 2: feed the saved postcode file into the property price source.
    print("\n[2/3] Processing property prices...")
    postcode_frame = pl.scan_parquet(postcodes_path)
    properties = PropertyPricesSource().process(postcode_frame)
    print(" Joined property prices with postcodes")

    # Stage 3: write the aggregates and report each output file's size.
    print("\n[3/3] Aggregating at H3 resolutions...")
    bytes_per_mb = 1024 * 1024
    for path in save_aggregates(properties):
        size_mb = path.stat().st_size / bytes_per_mb
        print(f" Saved: {path.name} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    run_pipeline()