Last night

2026-02-08 10:21:37 +00:00 · 2026-02-08 10:21:37 +00:00 · 42ee2d4c51
commit 42ee2d4c51
parent 2906b01734
47 changed files with 848 additions and 478 deletions
--- a/pipeline/journey_times/main.py
+++ b/pipeline/journey_times/main.py
@@ -98,10 +98,9 @@ def main():
    if checkpoint_path.exists():
        checkpoint_df = pl.read_parquet(checkpoint_path)
        # Deduplicate checkpoint rows per postcode, preferring rows with data
-        checkpoint_df = (
-            checkpoint_df.sort("public_transport_quick_minutes", nulls_last=True)
-            .unique(subset=["postcode"], keep="first")
-        )
+        checkpoint_df = checkpoint_df.sort(
+            "public_transport_quick_minutes", nulls_last=True
+        ).unique(subset=["postcode"], keep="first")
        completed_postcodes = set(checkpoint_df["postcode"].to_list())
        prior_results = [
            JourneyResult(
@@ -145,9 +144,9 @@ def main():
    results_df = results_to_dataframe(all_results)

    all_postcodes = {r.postcode for r in all_results}
-    coords_df = postcodes_df.filter(
-        pl.col("postcode").is_in(all_postcodes)
-    ).select(["postcode", "lat", "long"])
+    coords_df = postcodes_df.filter(pl.col("postcode").is_in(all_postcodes)).select(
+        ["postcode", "lat", "long"]
+    )
    results_df = coords_df.join(results_df, on="postcode", how="left")

    results_df = results_df.with_columns(
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@@ -113,7 +113,12 @@ def _build_wide(
            *[pl.col(c).fill_nan(None) for c in noise_cols],
        )
        .with_columns(
-            pl.max_horizontal(*noise_cols).fill_null(0).alias("noise_lden_db"),
+            pl.max_horizontal(*noise_cols).alias("noise_lden_db"),
+        )
+        .with_columns(
+            pl.col("noise_lden_db")
+            .fill_null(pl.col("noise_lden_db").min())
+            .alias("noise_lden_db"),
        )
        .select("postcode", "noise_lden_db")
    )
@@ -153,8 +158,8 @@ def _build_wide(
    wide = wide.with_columns(
        pl.when(pl.col("pp_property_type").is_in(["Terraced", "Semi-Detached"]))
        .then(pl.col("built_form"))
-        .otherwise(pl.col("epc_property_type"))
-        .alias("epc_property_type")
+        .otherwise(pl.col("pp_property_type"))
+        .alias("property_type")
    )

    wide = (
@@ -191,12 +196,13 @@ def _build_wide(
            "Barriers to Housing and Services Score",
            "lsoa21",
            "oa21",
+            "epc_property_type",
            "pp_property_type",
            "built_form",
        )
        .rename(
            {
-                "date_of_transfer": "Previous transaction date",
+                "date_of_transfer": "Date of last transaction",
                "construction_age_band": "Construction age",
                "is_construction_date_approximate": "Is construction date approximate",
                "pp_address": "Address per Property Register",
@@ -206,11 +212,11 @@ def _build_wide(
                "current_energy_rating": "Current energy rating",
                "potential_energy_rating": "Potential energy rating",
                "total_floor_area": "Total floor area (sqm)",
-                "epc_property_type": "Property type",
-                "restaurants_2km": "Restaurants within 2km",
-                "groceries_2km": "Groceries within 2km",
-                "parks_2km": "Parks within 2km",
-                "public_transport_2km": "Public transport within 2km",
+                "property_type": "Property type",
+                "restaurants_2km": "Number of restaurants within 2km",
+                "groceries_2km": "Number of grocery shops and supermarkets within 2km",
+                "parks_2km": "Number of parks within 2km",
+                "public_transport_2km": "Number of public transport stations within 2km",
                "latest_price": "Last known price",
                "number_habitable_rooms": "Number of bedrooms & living rooms",
                "noise_lden_db": "Noise (dB)",
@@ -219,7 +225,6 @@ def _build_wide(
                "max_download_speed": "Max available download speed (Mbps)",
                "serious_crime_avg_yr": "Serious crime (avg/yr)",
                "minor_crime_avg_yr": "Minor crime (avg/yr)",
-                "transaction_year": "Transaction year",
                "environmental_risk": "Environmental risk",
                "collapsible_deposits_risk": "Collapsible deposits risk",
                "compressible_ground_risk": "Compressible ground risk",
--- a/pipeline/transform/postcode_boundaries/process_oa.py
+++ b/pipeline/transform/postcode_boundaries/process_oa.py
@@ -42,7 +42,10 @@ def process_oa(

        for pc, polys in pc_inspire_polys.items():
            merged = unary_union(polys)
-            clipped = merged.intersection(oa_geom)
+            if not merged.is_valid:
+                merged = make_valid(merged)
+            valid_oa = oa_geom if oa_geom.is_valid else make_valid(oa_geom)
+            clipped = merged.intersection(valid_oa)
            if not clipped.is_empty:
                if not clipped.is_valid:
                    clipped = make_valid(clipped)
@@ -58,11 +61,13 @@ def process_oa(
        used = None
        for pc, geom in claimed.items():
            if used is not None:
+                if not geom.is_valid:
+                    geom = make_valid(geom)
+                if not used.is_valid:
+                    used = make_valid(used)
                geom = geom.difference(used)
                if geom.is_empty:
                    continue
-                if not geom.is_valid:
-                    geom = make_valid(geom)
                geom = _extract_polygonal(geom)
                if geom is None:
                    continue
@@ -75,11 +80,12 @@ def process_oa(
        all_claimed = unary_union(list(claimed.values()))
        if not all_claimed.is_valid:
            all_claimed = make_valid(all_claimed)
-        remaining = oa_geom.difference(all_claimed)
+        valid_oa = oa_geom if oa_geom.is_valid else make_valid(oa_geom)
+        remaining = valid_oa.difference(all_claimed)
        if not remaining.is_valid:
            remaining = make_valid(remaining)
    else:
-        remaining = oa_geom
+        remaining = oa_geom if oa_geom.is_valid else make_valid(oa_geom)

    # Distribute remaining area via Voronoi
    if not remaining.is_empty and remaining.area > 0.01:
--- a/pipeline/transform/postcode_boundaries/voronoi.py
+++ b/pipeline/transform/postcode_boundaries/voronoi.py
@@ -75,6 +75,9 @@ def compute_voronoi_regions(
    n_real = len(pts)
    pc_polys: dict[str, list[Polygon]] = defaultdict(list)

+    if not boundary.is_valid:
+        boundary = make_valid(boundary)
+
    for i in range(n_real):
        region_idx = vor.point_region[i]
        region = vor.regions[region_idx]
@@ -100,6 +103,8 @@ def _equal_split_fallback(
    postcodes: list[str], boundary: Polygon | MultiPolygon
 ) -> dict[str, Polygon | MultiPolygon]:
    """Split boundary into roughly equal horizontal strips, one per postcode."""
+    if not boundary.is_valid:
+        boundary = make_valid(boundary)
    min_x, min_y, max_x, max_y = boundary.bounds
    n = len(postcodes)
    result = {}
--- a/pipeline/utils/poi_counts.py
+++ b/pipeline/utils/poi_counts.py
@@ -1,14 +1,12 @@
 """Count POIs within a radius of properties, optimized via postcode deduplication."""

-import tempfile
-
 import numpy as np
 import polars as pl

 from .haversine import haversine_km


-def _count_pois_per_postcode(
+def count_pois_per_postcode(
    postcodes_df: pl.DataFrame,
    pois: pl.DataFrame,
    groups: dict[str, list[str]],
@@ -64,9 +62,7 @@ def _count_pois_per_postcode(
    pc_codes = postcodes_df["postcode"].to_list()

    # Initialize result arrays
-    result_counts = {
-        group: np.zeros(n_postcodes, dtype=np.int32) for group in groups
-    }
+    result_counts = {group: np.zeros(n_postcodes, dtype=np.int32) for group in groups}

    # Process in batches with progress
    batch_size = 50000
@@ -128,47 +124,3 @@ def _count_pois_per_postcode(
    result = pl.DataFrame(result_data)
    print("  Completed POI counting")
    return result
-
-
-def count_pois_within_radius(
-    properties: pl.DataFrame, pois: pl.DataFrame, radius_km: float = 2.0
-) -> dict[str, pl.Series]:
-    """
-    Count POIs within radius for properties, optimized by deduplicating postcodes.
-
-    Returns dict of {column_name: count_series} aligned to properties dataframe.
-    """
-    # Get unique postcodes with coordinates
-    print("Deduplicating postcodes...")
-    unique_postcodes = properties.select(["postcode", "lat", "lon"]).unique(
-        subset=["postcode"]
-    )
-
-    print(
-        f"  {len(properties):,} properties → {len(unique_postcodes):,} unique postcodes"
-    )
-
-    # Count POIs per postcode
-    postcode_counts = _count_pois_per_postcode(unique_postcodes, pois, radius_km)
-
-    print("  Writing postcode counts to temp file...")
-    with tempfile.NamedTemporaryFile(suffix=".parquet") as tmp:
-        tmp_path = tmp.name
-        postcode_counts.write_parquet(tmp_path)
-
-        # Join using lazy evaluation
-        print("  Joining counts back to properties (lazy)...")
-        count_cols = [f"{group}_{int(radius_km)}km" for group in POI_GROUPS]
-
-        # Convert properties to lazy frame, join, then collect
-        result_lazy = (
-            properties.lazy()
-            .select("postcode")
-            .join(pl.scan_parquet(tmp_path), on="postcode", how="left")
-            .select(count_cols)
-            .fill_null(0)
-        )
-
-        result_df = result_lazy.collect(engine="streaming")
-
-        return {col: result_df[col] for col in count_cols}