Add hexagon backend
This commit is contained in:
parent
a7cc4d9b2b
commit
ab704c0dc0
18 changed files with 1443 additions and 0 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
|
@ -2,3 +2,5 @@ data_sources
|
|||
.venv
|
||||
.claude
|
||||
tfl_journey_client
|
||||
**/node_modules
|
||||
**/__pycache__
|
||||
24
Taskfile.yml
Normal file
24
Taskfile.yml
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
version: '3'
|
||||
|
||||
tasks:
|
||||
install:
|
||||
desc: Install all dependencies
|
||||
cmds:
|
||||
- uv sync
|
||||
- cd frontend && npm install
|
||||
|
||||
pipeline:
|
||||
desc: Run data processing pipeline
|
||||
cmds:
|
||||
- uv run python -m pipeline.run
|
||||
|
||||
server:
|
||||
desc: Run FastAPI backend on port 8001
|
||||
cmds:
|
||||
- uv run fastapi dev server/main.py --port 8001
|
||||
|
||||
frontend:
|
||||
desc: Run frontend dev server on port 3030 (proxies /api to :8001)
|
||||
dir: frontend
|
||||
cmds:
|
||||
- npm run dev
|
||||
0
pipeline/__init__.py
Normal file
0
pipeline/__init__.py
Normal file
22
pipeline/base.py
Normal file
22
pipeline/base.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
from abc import ABC, abstractmethod
|
||||
import polars as pl
|
||||
|
||||
|
||||
class DataSource(ABC):
    """Abstract interface implemented by every pipeline data source.

    A concrete source must override the ``name`` property and the ``load``
    and ``process`` methods; until all three are provided the subclass
    cannot be instantiated.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Return the unique identifier for this data source."""

    @abstractmethod
    def load(self) -> pl.LazyFrame:
        """Return the raw data as a LazyFrame."""

    @abstractmethod
    def process(self, postcodes: pl.LazyFrame) -> pl.LazyFrame:
        """Return the data processed and joined with postcode coordinates.

        Args:
            postcodes: LazyFrame of postcode records to join against.
        """
|
||||
23
pipeline/config.py
Normal file
23
pipeline/config.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
"""Shared configuration for the pipeline and server."""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
# Data directories: everything is rooted at the ``data_sources`` folder that
# sits next to the package directory containing this file.
DATA_DIR = Path(__file__).parent.parent / "data_sources"
PROCESSED_DIR = DATA_DIR / "processed"
AGGREGATES_DIR = PROCESSED_DIR / "aggregates"

# H3 resolutions to generate and serve (higher resolution = smaller cells).
# Cell areas per resolution:
# https://h3geo.org/docs/core-library/restable/#average-area-in-m2
H3_RESOLUTIONS = [6, 7, 8, 9, 10, 11, 12]
DEFAULT_H3_RESOLUTION = 8

# Year filters: the full supported range, and the range used when a request
# does not specify one.
MIN_YEAR = 1995
MAX_YEAR = 2024
DEFAULT_MIN_YEAR = 2020
DEFAULT_MAX_YEAR = MAX_YEAR

# Price filters applied when a request does not specify a range.
DEFAULT_MIN_PRICE = 0
DEFAULT_MAX_PRICE = 2_000_000
|
||||
0
pipeline/processors/__init__.py
Normal file
0
pipeline/processors/__init__.py
Normal file
42
pipeline/processors/h3_aggregator.py
Normal file
42
pipeline/processors/h3_aggregator.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
from pathlib import Path
|
||||
import polars as pl
|
||||
|
||||
from pipeline.config import AGGREGATES_DIR, H3_RESOLUTIONS
|
||||
|
||||
|
||||
def aggregate(df: pl.LazyFrame, resolution: int) -> pl.LazyFrame:
    """Summarise prices per H3 cell and year at one resolution.

    Args:
        df: Property rows containing ``price``, ``year`` and an
            ``h3_res<resolution>`` column.
        resolution: H3 resolution whose cell column is used for grouping.

    Returns:
        A lazy frame with one row per (cell, year) holding ``count``,
        ``avg_price``, ``median_price``, ``min_price`` and ``max_price``;
        the cell column is renamed to ``h3``.
    """
    cell_column = f"h3_res{resolution}"
    price = pl.col("price")

    statistics = [
        pl.len().alias("count"),
        price.mean().alias("avg_price"),
        price.median().alias("median_price"),
        price.min().alias("min_price"),
        price.max().alias("max_price"),
    ]

    grouped = df.group_by(cell_column, "year").agg(statistics)
    # Use a resolution-independent column name so every output shares a schema.
    return grouped.rename({cell_column: "h3"})
|
||||
|
||||
|
||||
def aggregate_all(df: pl.LazyFrame) -> dict[int, pl.LazyFrame]:
    """Build the lazy per-cell aggregate for every configured H3 resolution.

    Returns:
        Mapping of resolution to its (still lazy) aggregate, in the order of
        ``H3_RESOLUTIONS``.
    """
    per_resolution: dict[int, pl.LazyFrame] = {}
    for resolution in H3_RESOLUTIONS:
        per_resolution[resolution] = aggregate(df, resolution)
    return per_resolution
|
||||
|
||||
|
||||
def save_aggregates(df: pl.LazyFrame, output_dir: Path | None = None) -> list[Path]:
    """Write one aggregate parquet file per H3 resolution.

    Args:
        df: Property rows to aggregate.
        output_dir: Destination directory; falls back to ``AGGREGATES_DIR``
            when not given. Created (with parents) if missing.

    Returns:
        Paths of the written ``res<N>.parquet`` files, in resolution order.
    """
    target_dir = output_dir if output_dir else AGGREGATES_DIR
    target_dir.mkdir(parents=True, exist_ok=True)

    written: list[Path] = []
    for resolution, lazy_aggregate in aggregate_all(df).items():
        destination = target_dir / f"res{resolution}.parquet"
        # Each resolution's plan is collected and written on its own.
        lazy_aggregate.collect().write_parquet(destination)
        written.append(destination)

    return written
|
||||
36
pipeline/run.py
Normal file
36
pipeline/run.py
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
"""Pipeline CLI to process property data with H3 spatial indexing."""
|
||||
|
||||
from pathlib import Path
|
||||
import polars as pl
|
||||
from tqdm import tqdm
|
||||
|
||||
from pipeline.sources.postcodes import save_postcodes, DATA_DIR
|
||||
from pipeline.sources.property_prices import PropertyPricesSource
|
||||
from pipeline.processors.h3_aggregator import save_aggregates
|
||||
|
||||
|
||||
def run_pipeline():
    """Run the three pipeline stages in order, reporting progress on stdout.

    1. Write the postcode table enriched with H3 indices.
    2. Build the (lazy) join of property prices with those postcodes.
    3. Aggregate at every H3 resolution and write the results.
    """
    banner = "=" * 60
    print(banner)
    print("Property Map Data Pipeline")
    print(banner)

    # Stage 1: postcodes with H3 indices.
    print("\n[1/3] Processing postcodes with H3 indices...")
    postcodes_path = save_postcodes()
    print(f" Saved: {postcodes_path}")

    # Stage 2: the join is only planned here; it is lazy and gets executed
    # when the aggregates are collected in stage 3.
    print("\n[2/3] Processing property prices...")
    postcodes_lf = pl.scan_parquet(postcodes_path)
    properties = PropertyPricesSource().process(postcodes_lf)
    print(" Joined property prices with postcodes")

    # Stage 3: aggregate and report the size of each written file.
    print("\n[3/3] Aggregating at H3 resolutions...")
    bytes_per_mb = 1024 * 1024
    for path in save_aggregates(properties):
        size_mb = path.stat().st_size / bytes_per_mb
        print(f" Saved: {path.name} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    run_pipeline()
|
||||
0
pipeline/sources/__init__.py
Normal file
0
pipeline/sources/__init__.py
Normal file
48
pipeline/sources/postcodes.py
Normal file
48
pipeline/sources/postcodes.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
from pathlib import Path
|
||||
import polars as pl
|
||||
import h3
|
||||
|
||||
from pipeline.config import DATA_DIR, H3_RESOLUTIONS, PROCESSED_DIR
|
||||
|
||||
|
||||
def lat_long_to_h3(lat: float, long: float, resolution: int) -> str:
    """Return the H3 cell index containing the point at the given resolution.

    Thin wrapper around ``h3.latlng_to_cell``.
    """
    cell = h3.latlng_to_cell(lat, long, resolution)
    return cell
|
||||
|
||||
|
||||
def load_postcodes() -> pl.LazyFrame:
    """Lazily scan ``arcgis_data.parquet`` and keep only the needed columns.

    Returns:
        A lazy frame with ``postcode`` (renamed from ``pcds``), ``lat`` and
        ``long``.
    """
    source = DATA_DIR / "arcgis_data.parquet"
    wanted = [
        pl.col("pcds").alias("postcode"),
        pl.col("lat"),
        pl.col("long"),
    ]
    return pl.scan_parquet(source).select(wanted)
|
||||
|
||||
|
||||
def process_postcodes() -> pl.LazyFrame:
    """Add one ``h3_res<N>`` column per configured resolution to the postcodes.

    The postcode table is materialised first because the H3 index is computed
    row by row through a Python callback; the result is handed back as a
    LazyFrame.
    """
    frame = load_postcodes().collect()
    coordinates = pl.struct(["lat", "long"])

    for resolution in H3_RESOLUTIONS:
        # Bind the current resolution as a default so the callback does not
        # depend on the loop variable.
        def to_cell(point, resolution=resolution):
            return lat_long_to_h3(point["lat"], point["long"], resolution)

        frame = frame.with_columns(
            coordinates.map_elements(to_cell, return_dtype=pl.Utf8).alias(
                f"h3_res{resolution}"
            )
        )

    return frame.lazy()
|
||||
|
||||
|
||||
def save_postcodes(output_path: Path | None = None) -> Path:
    """Write the H3-enriched postcode table to parquet and return its path.

    Args:
        output_path: Destination file; falls back to
            ``PROCESSED_DIR / "postcodes_h3.parquet"`` when not given. Parent
            directories are created if missing.
    """
    if output_path:
        destination = output_path
    else:
        destination = PROCESSED_DIR / "postcodes_h3.parquet"
    destination.parent.mkdir(parents=True, exist_ok=True)

    process_postcodes().collect().write_parquet(destination)
    return destination
|
||||
41
pipeline/sources/property_prices.py
Normal file
41
pipeline/sources/property_prices.py
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
import polars as pl
|
||||
|
||||
from pipeline.base import DataSource
|
||||
from pipeline.config import DATA_DIR, H3_RESOLUTIONS
|
||||
|
||||
|
||||
class PropertyPricesSource(DataSource):
    """Land Registry property prices exposed as a pipeline data source."""

    @property
    def name(self) -> str:
        """Identifier of this source."""
        return "property_prices"

    def load(self) -> pl.LazyFrame:
        """Lazily scan ``pp-complete.parquet`` from the data directory."""
        raw_path = DATA_DIR / "pp-complete.parquet"
        return pl.scan_parquet(raw_path)

    def process(self, postcodes: pl.LazyFrame) -> pl.LazyFrame:
        """Attach coordinates and H3 indices to each sale via its postcode.

        Args:
            postcodes: Postcode table providing ``postcode``, ``lat``,
                ``long`` and one ``h3_res<N>`` column per configured
                resolution.

        Returns:
            A lazy frame with ``price``, ``year``, ``property_type``,
            ``lat``, ``long`` and the H3 columns. The join is an inner join,
            so sales whose postcode is absent from ``postcodes`` are dropped.
        """
        sale_year = pl.col("date_of_transfer").dt.year().alias("year")
        sales = self.load().select(
            pl.col("price"),
            sale_year,
            pl.col("property_type"),
            pl.col("postcode"),
        )

        located = sales.join(postcodes, on="postcode", how="inner")

        output_columns = ["price", "year", "property_type", "lat", "long"]
        output_columns += [f"h3_res{res}" for res in H3_RESOLUTIONS]
        return located.select(output_columns)
|
||||
|
|
@ -8,10 +8,16 @@ dependencies = [
|
|||
"attrs>=22.2.0",
|
||||
"httpx>=0.28.1",
|
||||
"journey-client",
|
||||
"numpy>=1.26.0",
|
||||
"pandas>=2.0.0",
|
||||
"plotly>=6.5.2",
|
||||
"polars>=1.37.1",
|
||||
"pyarrow>=15.0.0",
|
||||
"python-dateutil>=2.8.0",
|
||||
"tqdm>=4.67.1",
|
||||
"fastapi[standard]>=0.115.0",
|
||||
"uvicorn>=0.34.0",
|
||||
"h3>=4.0.0",
|
||||
]
|
||||
|
||||
[tool.uv.sources]
|
||||
|
|
|
|||
0
server/__init__.py
Normal file
0
server/__init__.py
Normal file
25
server/config.py
Normal file
25
server/config.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
"""Server configuration - imports shared values from pipeline config."""
|
||||
|
||||
from pipeline.config import (
|
||||
AGGREGATES_DIR,
|
||||
H3_RESOLUTIONS as VALID_RESOLUTIONS,
|
||||
DEFAULT_H3_RESOLUTION as DEFAULT_RESOLUTION,
|
||||
MIN_YEAR,
|
||||
MAX_YEAR,
|
||||
DEFAULT_MIN_YEAR,
|
||||
DEFAULT_MAX_YEAR,
|
||||
DEFAULT_MIN_PRICE,
|
||||
DEFAULT_MAX_PRICE,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"AGGREGATES_DIR",
|
||||
"VALID_RESOLUTIONS",
|
||||
"DEFAULT_RESOLUTION",
|
||||
"MIN_YEAR",
|
||||
"MAX_YEAR",
|
||||
"DEFAULT_MIN_YEAR",
|
||||
"DEFAULT_MAX_YEAR",
|
||||
"DEFAULT_MIN_PRICE",
|
||||
"DEFAULT_MAX_PRICE",
|
||||
]
|
||||
23
server/main.py
Normal file
23
server/main.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
from pathlib import Path
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from server.routes import hexagons
|
||||
|
||||
app = FastAPI(title="Property Map API")

# CORS: keep the API open to any origin, but do not advertise credential
# support. A wildcard origin must not be combined with
# ``allow_credentials=True`` (see the FastAPI CORS documentation); nothing
# registered in this module relies on cookies or other credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# All API routes live under /api.
app.include_router(hexagons.router, prefix="/api")

# Mount static files for production (frontend build). Registered after the
# API router, and only when a build output directory is present.
frontend_dist = Path(__file__).parent.parent / "frontend" / "dist"
if frontend_dist.exists():
    app.mount("/", StaticFiles(directory=frontend_dist, html=True), name="static")
|
||||
0
server/routes/__init__.py
Normal file
0
server/routes/__init__.py
Normal file
98
server/routes/hexagons.py
Normal file
98
server/routes/hexagons.py
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
from typing import Any
|
||||
from fastapi import APIRouter, Query
|
||||
import polars as pl
|
||||
import h3
|
||||
|
||||
from server.config import (
|
||||
AGGREGATES_DIR,
|
||||
VALID_RESOLUTIONS,
|
||||
DEFAULT_RESOLUTION,
|
||||
DEFAULT_MIN_YEAR,
|
||||
DEFAULT_MAX_YEAR,
|
||||
DEFAULT_MIN_PRICE,
|
||||
DEFAULT_MAX_PRICE,
|
||||
)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def h3_to_geojson_feature(h3_index: str, properties: dict[str, Any]) -> dict:
    """Build a GeoJSON ``Feature`` whose polygon is the boundary of an H3 cell.

    Args:
        h3_index: H3 cell index.
        properties: Extra feature properties; merged after the ``h3`` key, so
            an ``h3`` entry in this mapping takes precedence.
    """
    # h3 yields (lat, lng) pairs, whereas GeoJSON positions are [lng, lat].
    ring = [[lng, lat] for lat, lng in h3.cell_to_boundary(h3_index)]
    # A GeoJSON linear ring repeats its first position at the end.
    ring.append(ring[0])

    feature_properties = {"h3": h3_index}
    feature_properties.update(properties)

    geometry = {"type": "Polygon", "coordinates": [ring]}
    return {
        "type": "Feature",
        "properties": feature_properties,
        "geometry": geometry,
    }
|
||||
|
||||
|
||||
@router.get("/hexagons")
async def get_hexagons(
    resolution: int = Query(
        DEFAULT_RESOLUTION,
        ge=min(VALID_RESOLUTIONS),
        le=max(VALID_RESOLUTIONS),
        description=f"H3 resolution ({min(VALID_RESOLUTIONS)}-{max(VALID_RESOLUTIONS)})",
    ),
    min_year: int = Query(DEFAULT_MIN_YEAR, description="Minimum year filter"),
    max_year: int = Query(DEFAULT_MAX_YEAR, description="Maximum year filter"),
    min_price: float = Query(DEFAULT_MIN_PRICE, description="Minimum average price"),
    max_price: float = Query(DEFAULT_MAX_PRICE, description="Maximum average price"),
    bounds: str | None = Query(
        None, description="Bounding box: lat1,lng1,lat2,lng2"
    ),
) -> dict:
    """Return aggregated property data as a GeoJSON FeatureCollection of hexagons.

    The pre-computed per-(cell, year) aggregate for the requested resolution
    is filtered to the year range, collapsed to one row per cell, and then
    filtered by the resulting average price.

    Args:
        resolution: H3 resolution; a value outside ``VALID_RESOLUTIONS`` is
            replaced by ``DEFAULT_RESOLUTION``.
        min_year: First year to include (inclusive).
        max_year: Last year to include (inclusive).
        min_price: Lower bound (inclusive) on a cell's average price.
        max_price: Upper bound (inclusive) on a cell's average price.
        bounds: Bounding box string. NOTE(review): currently accepted but not
            applied; the response always covers every matching cell.

    Returns:
        A GeoJSON ``FeatureCollection``; it is empty when the aggregate file
        for the resolution does not exist.

    Notes:
        ``avg_price`` is the count-weighted mean of the yearly averages.
        ``median_price`` is the median of the yearly medians, not the median
        of the underlying sales.
    """
    if resolution not in VALID_RESOLUTIONS:
        resolution = DEFAULT_RESOLUTION

    # Load the appropriate resolution file
    parquet_path = AGGREGATES_DIR / f"res{resolution}.parquet"
    if not parquet_path.exists():
        return {"type": "FeatureCollection", "features": []}

    df = pl.scan_parquet(parquet_path)

    # Filter by year range
    df = df.filter((pl.col("year") >= min_year) & (pl.col("year") <= max_year))

    # Aggregate across years (weighted by count)
    df = df.group_by("h3").agg(
        pl.col("count").sum().alias("count"),
        (pl.col("avg_price") * pl.col("count")).sum().alias("weighted_price_sum"),
        pl.col("median_price").median().alias("median_price"),
        pl.col("min_price").min().alias("min_price"),
        pl.col("max_price").max().alias("max_price"),
    )

    # Calculate weighted average price
    df = df.with_columns(
        (pl.col("weighted_price_sum") / pl.col("count")).alias("avg_price")
    ).drop("weighted_price_sum")

    # Filter by price range
    df = df.filter(
        (pl.col("avg_price") >= min_price) & (pl.col("avg_price") <= max_price)
    )

    # Collect and convert to GeoJSON
    result = df.collect()

    features = []
    for row in result.iter_rows(named=True):
        median_price = row["median_price"]
        properties = {
            "count": row["count"],
            "avg_price": round(row["avg_price"], 2),
            # Explicit None check: a truthiness test would report a
            # legitimate median of 0 as missing.
            "median_price": (
                round(median_price, 2) if median_price is not None else None
            ),
            "min_price": row["min_price"],
            "max_price": row["max_price"],
        }
        features.append(h3_to_geojson_feature(row["h3"], properties))

    return {"type": "FeatureCollection", "features": features}
|
||||
Loading…
Add table
Add a link
Reference in a new issue