36 lines
1.2 KiB
Python
36 lines
1.2 KiB
Python
"""Pipeline CLI to process property data with H3 spatial indexing."""
|
|
|
|
from pathlib import Path
|
|
import polars as pl
|
|
from tqdm import tqdm
|
|
|
|
from pipeline.sources.postcodes import save_postcodes, DATA_DIR
|
|
from pipeline.sources.property_prices import PropertyPricesSource
|
|
from pipeline.processors.h3_aggregator import save_aggregates
|
|
|
|
|
|
def run_pipeline():
    """Execute the three pipeline stages in order, reporting progress on stdout.

    Stages, as announced by the progress messages:
      1. ``save_postcodes()`` writes the postcode dataset and returns its path.
      2. That file is opened with ``pl.scan_parquet`` and handed to
         ``PropertyPricesSource.process``.
      3. ``save_aggregates`` receives the processed properties; every path it
         returns is listed together with its on-disk size in MB.

    Returns nothing; all results are files written by the helpers it calls.
    """
    banner = "=" * 60
    print(banner)
    print("Property Map Data Pipeline")
    print(banner)

    # Stage 1: postcodes
    print("\n[1/3] Processing postcodes with H3 indices...")
    postcodes_path = save_postcodes()
    print(f" Saved: {postcodes_path}")

    # Stage 2: property prices
    print("\n[2/3] Processing property prices...")
    # Scan before constructing the source so failures surface in the same
    # order as they always have.
    postcode_frame = pl.scan_parquet(postcodes_path)
    properties = PropertyPricesSource().process(postcode_frame)
    print(" Joined property prices with postcodes")

    # Stage 3: aggregation
    print("\n[3/3] Aggregating at H3 resolutions...")
    bytes_per_mb = 1024 * 1024
    for output_file in save_aggregates(properties):
        megabytes = output_file.stat().st_size / bytes_per_mb
        print(f" Saved: {output_file.name} ({megabytes:.1f} MB)")
|
|
|
|
# Script entry point: run the pipeline only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    run_pipeline()
|