"""Pipeline CLI to process property data with H3 spatial indexing.""" import polars as pl from pipeline.sources.postcodes import save_postcodes from pipeline.sources.property_prices import PropertyPricesSource from pipeline.processors.h3_aggregator import save_aggregates from pipeline.processors.journey_times_aggregator import aggregate_journey_times def run_pipeline(): """Run the full data processing pipeline.""" print("=" * 60) print("Property Map Data Pipeline") print("=" * 60) # Step 1: Process postcodes with H3 indices print("\n[1/4] Processing postcodes with H3 indices...") postcodes_path = save_postcodes() print(f" Saved: {postcodes_path}") print("\n[2/4] Processing property prices...") postcodes = pl.scan_parquet(postcodes_path) property_source = PropertyPricesSource() properties = property_source.process(postcodes) print(" Joined property prices with postcodes") print("\n[3/4] Aggregating at H3 resolutions...") saved_paths = save_aggregates(properties) for path in saved_paths: size_mb = path.stat().st_size / (1024 * 1024) print(f" Saved: {path.name} ({size_mb:.1f} MB)") print("\n[4/4] Adding journey times to aggregates...") updated_paths = aggregate_journey_times() if updated_paths: for path in updated_paths: size_mb = path.stat().st_size / (1024 * 1024) print(f" Updated: {path.name} ({size_mb:.1f} MB)") else: print(" Skipped (no journey time data found)") if __name__ == "__main__": run_pipeline()