Add hexagon backend

2026-01-25 21:07:05 +00:00 · 2026-01-25 21:07:05 +00:00 · ab704c0dc0
commit ab704c0dc0
parent a7cc4d9b2b
18 changed files with 1443 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,3 +2,5 @@ data_sources
 .venv
 .claude
 tfl_journey_client
+**/node_modules
+**/__pycache__
--- a/Taskfile.yml
+++ b/Taskfile.yml
@ -0,0 +1,24 @@
+# Task runner definitions for the project (Taskfile schema version 3).
+version: '3'
+
+tasks:
+  # Installs the Python environment with uv, then the frontend's npm packages.
+  install:
+    desc: Install all dependencies
+    cmds:
+      - uv sync
+      - cd frontend && npm install
+
+  # Runs the `pipeline.run` module inside the uv-managed environment.
+  pipeline:
+    desc: Run data processing pipeline
+    cmds:
+      - uv run python -m pipeline.run
+
+  # Starts the FastAPI app defined in server/main.py in dev mode on port 8001.
+  server:
+    desc: Run FastAPI backend on port 8001
+    cmds:
+      - uv run fastapi dev server/main.py --port 8001
+  
+  # Runs `npm run dev` from the frontend/ directory.
+  frontend:
+    desc: Run frontend dev server on port 3030 (proxies /api to :8001)
+    dir: frontend
+    cmds:
+      - npm run dev
--- a/pipeline/init.py
+++ b/pipeline/init.py
--- a/pipeline/base.py
+++ b/pipeline/base.py
@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+import polars as pl
+
+
+class DataSource(ABC):
+    """Base class for all data sources."""
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique identifier for this data source."""
+        pass
+
+    @abstractmethod
+    def load(self) -> pl.LazyFrame:
+        """Load raw data as LazyFrame."""
+        pass
+
+    @abstractmethod
+    def process(self, postcodes: pl.LazyFrame) -> pl.LazyFrame:
+        """Process and join with postcode coordinates."""
+        pass
--- a/pipeline/config.py
+++ b/pipeline/config.py
@ -0,0 +1,23 @@
+"""Shared configuration for the pipeline and server."""
+
+from pathlib import Path
+
+# Data directories
+DATA_DIR = Path(__file__).parent.parent / "data_sources"
+PROCESSED_DIR = DATA_DIR / "processed"
+AGGREGATES_DIR = PROCESSED_DIR / "aggregates"
+
+# H3 resolutions to generate and serve
+# https://h3geo.org/docs/core-library/restable/#average-area-in-m2
+H3_RESOLUTIONS = [6, 7, 8, 9, 10, 11, 12]
+DEFAULT_H3_RESOLUTION = 8
+
+# Year filters
+MIN_YEAR = 1995
+MAX_YEAR = 2024
+DEFAULT_MIN_YEAR = 2020
+DEFAULT_MAX_YEAR = 2024
+
+# Price filters
+DEFAULT_MIN_PRICE = 0
+DEFAULT_MAX_PRICE = 2_000_000
--- a/pipeline/processors/init.py
+++ b/pipeline/processors/init.py
--- a/pipeline/processors/h3_aggregator.py
+++ b/pipeline/processors/h3_aggregator.py
@ -0,0 +1,42 @@
+from pathlib import Path
+import polars as pl
+
+from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS
+
+
+def aggregate(df: pl.LazyFrame, resolution: int) -> pl.LazyFrame:
+    """Aggregate property data by H3 cell and year."""
+    h3_col = f"h3_res{resolution}"
+
+    return (
+        df.group_by(h3_col, "year")
+        .agg(
+            pl.len().alias("count"),
+            pl.col("price").mean().alias("avg_price"),
+            pl.col("price").median().alias("median_price"),
+            pl.col("price").min().alias("min_price"),
+            pl.col("price").max().alias("max_price"),
+        )
+        .rename({h3_col: "h3"})
+    )
+
+
+def aggregate_all(df: pl.LazyFrame) -> dict[int, pl.LazyFrame]:
+    """Aggregate at all H3 resolutions."""
+    return {res: aggregate(df, res) for res in H3_RESOLUTIONS}
+
+
+def save_aggregates(df: pl.LazyFrame, output_dir: Path | None = None) -> list[Path]:
+    """Aggregate and save at all H3 resolutions."""
+    output_dir = output_dir or AGGREGATES_DIR
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    saved_paths = []
+    aggregates = aggregate_all(df)
+
+    for res, agg_df in aggregates.items():
+        output_path = output_dir / f"res{res}.parquet"
+        agg_df.collect().write_parquet(output_path)
+        saved_paths.append(output_path)
+
+    return saved_paths
--- a/pipeline/run.py
+++ b/pipeline/run.py
@ -0,0 +1,36 @@
+"""Pipeline CLI to process property data with H3 spatial indexing."""
+
+from pathlib import Path
+import polars as pl
+from tqdm import tqdm
+
+from pipeline.sources.postcodes import save_postcodes, DATA_DIR
+from pipeline.sources.property_prices import PropertyPricesSource
+from pipeline.processors.h3_aggregator import save_aggregates
+
+
+def run_pipeline():
+    """Run the full data processing pipeline."""
+    print("=" * 60)
+    print("Property Map Data Pipeline")
+    print("=" * 60)
+
+    # Step 1: Process postcodes with H3 indices
+    print("\n[1/3] Processing postcodes with H3 indices...")
+    postcodes_path = save_postcodes()
+    print(f"      Saved: {postcodes_path}")
+
+    print("\n[2/3] Processing property prices...")
+    postcodes = pl.scan_parquet(postcodes_path)
+    property_source = PropertyPricesSource()
+    properties = property_source.process(postcodes)
+    print("      Joined property prices with postcodes")
+
+    print("\n[3/3] Aggregating at H3 resolutions...")
+    saved_paths = save_aggregates(properties)
+    for path in saved_paths:
+        size_mb = path.stat().st_size / (1024 * 1024)
+        print(f"      Saved: {path.name} ({size_mb:.1f} MB)")
+
+if __name__ == "__main__":
+    run_pipeline()
--- a/pipeline/sources/init.py
+++ b/pipeline/sources/init.py
--- a/pipeline/sources/postcodes.py
+++ b/pipeline/sources/postcodes.py
@ -0,0 +1,48 @@
+from pathlib import Path
+import polars as pl
+import h3
+
+from pipeline.config import DATA_DIR, H3_RESOLUTIONS, PROCESSED_DIR
+
+
+def lat_long_to_h3(lat: float, long: float, resolution: int) -> str:
+    """Convert lat/long to H3 index at given resolution."""
+    return h3.latlng_to_cell(lat, long, resolution)
+
+
+def load_postcodes() -> pl.LazyFrame:
+    """Load postcode data from arcgis parquet file."""
+    return pl.scan_parquet(DATA_DIR / "arcgis_data.parquet").select(
+        pl.col("pcds").alias("postcode"),
+        pl.col("lat"),
+        pl.col("long"),
+    )
+
+
+def process_postcodes() -> pl.LazyFrame:
+    """Process postcodes and add H3 indices at multiple resolutions."""
+    df = load_postcodes().collect()
+
+    for res in H3_RESOLUTIONS:
+        col_name = f"h3_res{res}"
+        df = df.with_columns(
+            pl.struct(["lat", "long"])
+            .map_elements(
+                lambda x: lat_long_to_h3(x["lat"], x["long"], res),
+                return_dtype=pl.Utf8,
+            )
+            .alias(col_name)
+        )
+
+    return df.lazy()
+
+
+def save_postcodes(output_path: Path | None = None) -> Path:
+    """Process and save postcodes with H3 indices."""
+    output_path = output_path or PROCESSED_DIR / "postcodes_h3.parquet"
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    df = process_postcodes().collect()
+    df.write_parquet(output_path)
+
+    return output_path
--- a/pipeline/sources/property_prices.py
+++ b/pipeline/sources/property_prices.py
@ -0,0 +1,41 @@
+import polars as pl
+
+from pipeline.base import DataSource
+from pipeline.config import DATA_DIR, H3_RESOLUTIONS
+
+
+class PropertyPricesSource(DataSource):
+    """Land Registry property prices data source."""
+
+    @property
+    def name(self) -> str:
+        return "property_prices"
+
+    def load(self) -> pl.LazyFrame:
+        """Load raw property prices data."""
+        return pl.scan_parquet(DATA_DIR / "pp-complete.parquet")
+
+    def process(self, postcodes: pl.LazyFrame) -> pl.LazyFrame:
+        """Process and join with postcode coordinates and H3 indices."""
+        prices = self.load().select(
+            pl.col("price"),
+            pl.col("date_of_transfer").dt.year().alias("year"),
+            pl.col("property_type"),
+            pl.col("postcode"),
+        )
+
+        joined = prices.join(
+            postcodes,
+            on="postcode",
+            how="inner",
+        )
+
+        h3_cols = [pl.col(f"h3_res{res}") for res in H3_RESOLUTIONS]
+        return joined.select(
+            pl.col("price"),
+            pl.col("year"),
+            pl.col("property_type"),
+            pl.col("lat"),
+            pl.col("long"),
+            *h3_cols,
+        )
--- a/pyproject.toml
+++ b/pyproject.toml
@ -8,10 +8,16 @@ dependencies = [
    "attrs>=22.2.0",
    "httpx>=0.28.1",
    "journey-client",
+    "numpy>=1.26.0",
+    "pandas>=2.0.0",
    "plotly>=6.5.2",
    "polars>=1.37.1",
+    "pyarrow>=15.0.0",
    "python-dateutil>=2.8.0",
    "tqdm>=4.67.1",
+    "fastapi[standard]>=0.115.0",
+    "uvicorn>=0.34.0",
+    "h3>=4.0.0",  # code uses the v4 API names (latlng_to_cell, cell_to_boundary)
 ]

 [tool.uv.sources]
--- a/server/init.py
+++ b/server/init.py
--- a/server/config.py
+++ b/server/config.py
@ -0,0 +1,25 @@
+"""Server configuration - imports shared values from pipeline config."""
+
+from pipeline.config import (
+    AGGREGATES_DIR,
+    H3_RESOLUTIONS as VALID_RESOLUTIONS,
+    DEFAULT_H3_RESOLUTION as DEFAULT_RESOLUTION,
+    MIN_YEAR,
+    MAX_YEAR,
+    DEFAULT_MIN_YEAR,
+    DEFAULT_MAX_YEAR,
+    DEFAULT_MIN_PRICE,
+    DEFAULT_MAX_PRICE,
+)
+
+__all__ = [
+    "AGGREGATES_DIR",
+    "VALID_RESOLUTIONS",
+    "DEFAULT_RESOLUTION",
+    "MIN_YEAR",
+    "MAX_YEAR",
+    "DEFAULT_MIN_YEAR",
+    "DEFAULT_MAX_YEAR",
+    "DEFAULT_MIN_PRICE",
+    "DEFAULT_MAX_PRICE",
+]
--- a/server/main.py
+++ b/server/main.py
@ -0,0 +1,23 @@
+from pathlib import Path
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.staticfiles import StaticFiles
+
+from server.routes import hexagons
+
+app = FastAPI(title="Property Map API")
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+app.include_router(hexagons.router, prefix="/api")
+
+# Mount static files for production (frontend build)
+frontend_dist = Path(__file__).parent.parent / "frontend" / "dist"
+if frontend_dist.exists():
+    app.mount("/", StaticFiles(directory=frontend_dist, html=True), name="static")
--- a/server/routes/init.py
+++ b/server/routes/init.py
--- a/server/routes/hexagons.py
+++ b/server/routes/hexagons.py
@ -0,0 +1,98 @@
+from typing import Any
+from fastapi import APIRouter, Query
+import polars as pl
+import h3
+
+from server.config import (
+    AGGREGATES_DIR,
+    VALID_RESOLUTIONS,
+    DEFAULT_RESOLUTION,
+    DEFAULT_MIN_YEAR,
+    DEFAULT_MAX_YEAR,
+    DEFAULT_MIN_PRICE,
+    DEFAULT_MAX_PRICE,
+)
+
+router = APIRouter()
+
+
+def h3_to_geojson_feature(h3_index: str, properties: dict[str, Any]) -> dict:
+    """Convert H3 index to GeoJSON feature with polygon geometry."""
+    boundary = h3.cell_to_boundary(h3_index)
+    # h3 returns (lat, lng) pairs, GeoJSON needs [lng, lat]
+    coordinates = [[lng, lat] for lat, lng in boundary]
+    # Close the polygon
+    coordinates.append(coordinates[0])
+
+    return {
+        "type": "Feature",
+        "properties": {"h3": h3_index, **properties},
+        "geometry": {"type": "Polygon", "coordinates": [coordinates]},
+    }
+
+
+@router.get("/hexagons")
+async def get_hexagons(
+    resolution: int = Query(
+        DEFAULT_RESOLUTION,
+        ge=min(VALID_RESOLUTIONS),
+        le=max(VALID_RESOLUTIONS),
+        description=f"H3 resolution ({min(VALID_RESOLUTIONS)}-{max(VALID_RESOLUTIONS)})",
+    ),
+    min_year: int = Query(DEFAULT_MIN_YEAR, description="Minimum year filter"),
+    max_year: int = Query(DEFAULT_MAX_YEAR, description="Maximum year filter"),
+    min_price: float = Query(DEFAULT_MIN_PRICE, description="Minimum average price"),
+    max_price: float = Query(DEFAULT_MAX_PRICE, description="Maximum average price"),
+    bounds: str | None = Query(
+        None, description="Bounding box: lat1,lng1,lat2,lng2"
+    ),
+) -> dict:
+    """Get aggregated property data as GeoJSON hexagons."""
+    if resolution not in VALID_RESOLUTIONS:
+        resolution = DEFAULT_RESOLUTION
+
+    # Load the appropriate resolution file
+    parquet_path = AGGREGATES_DIR / f"res{resolution}.parquet"
+    if not parquet_path.exists():
+        return {"type": "FeatureCollection", "features": []}
+
+    df = pl.scan_parquet(parquet_path)
+
+    # Filter by year range
+    df = df.filter((pl.col("year") >= min_year) & (pl.col("year") <= max_year))
+
+    # Aggregate across years (weighted by count)
+    df = df.group_by("h3").agg(
+        pl.col("count").sum().alias("count"),
+        (pl.col("avg_price") * pl.col("count")).sum().alias("weighted_price_sum"),
+        pl.col("median_price").median().alias("median_price"),
+        pl.col("min_price").min().alias("min_price"),
+        pl.col("max_price").max().alias("max_price"),
+    )
+
+    # Calculate weighted average price
+    df = df.with_columns(
+        (pl.col("weighted_price_sum") / pl.col("count")).alias("avg_price")
+    ).drop("weighted_price_sum")
+
+    # Filter by price range
+    df = df.filter(
+        (pl.col("avg_price") >= min_price) & (pl.col("avg_price") <= max_price)
+    )
+
+    # Collect and convert to GeoJSON
+    result = df.collect()
+
+    features = []
+    for row in result.iter_rows(named=True):
+        h3_index = row["h3"]
+        properties = {
+            "count": row["count"],
+            "avg_price": round(row["avg_price"], 2),
+            "median_price": round(row["median_price"], 2) if row["median_price"] else None,
+            "min_price": row["min_price"],
+            "max_price": row["max_price"],
+        }
+        features.append(h3_to_geojson_feature(h3_index, properties))
+
+    return {"type": "FeatureCollection", "features": features}
--- a/uv.lock
+++ b/uv.lock