Fmt

2026-03-15 21:22:28 +00:00 · 2026-03-15 21:22:28 +00:00 · c38d654ac7
commit c38d654ac7
parent 479ef92236
44 changed files with 2526 additions and 701 deletions
--- a/pipeline/download/lsoa_population.py
+++ b/pipeline/download/lsoa_population.py
@@ -40,7 +40,9 @@ def download_and_convert(output_path: Path) -> None:
    df = pl.concat(frames)
    print(f"Total rows: {df.height}")

-    result = df.rename({"GEOGRAPHY_CODE": "lsoa21", "OBS_VALUE": "population"}).with_columns(
+    result = df.rename(
+        {"GEOGRAPHY_CODE": "lsoa21", "OBS_VALUE": "population"}
+    ).with_columns(
        pl.col("population").cast(pl.UInt32),
    )

@@ -48,7 +50,9 @@ def download_and_convert(output_path: Path) -> None:
    result = result.filter(pl.col("lsoa21").str.starts_with("E"))

    print(f"England LSOAs: {result.height}")
-    print(f"Population range: {result['population'].min()} - {result['population'].max()}")
+    print(
+        f"Population range: {result['population'].min()} - {result['population'].max()}"
+    )
    print(f"Mean population: {result['population'].mean():.0f}")

    output_path.parent.mkdir(parents=True, exist_ok=True)
--- a/pipeline/download/places.py
+++ b/pipeline/download/places.py
@@ -119,7 +119,11 @@ class PlaceHandler(osmium.SimpleHandler):
            station_tag = tags.get("station", "")
            network = tags.get("network", "").lower()
            # Skip tram stops
-            if station_tag == "light_rail" or "tramlink" in network or "tram" in network:
+            if (
+                station_tag == "light_rail"
+                or "tramlink" in network
+                or "tram" in network
+            ):
                return
            display_name = _station_display_name(name, tags)
            self._add(display_name, "station", lat, lon, population)
@@ -131,9 +135,7 @@ def main() -> None:
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
-    parser.add_argument(
-        "--pbf", type=Path, required=True, help="Path to OSM PBF file"
-    )
+    parser.add_argument("--pbf", type=Path, required=True, help="Path to OSM PBF file")
    parser.add_argument(
        "--boundary",
        type=Path,
--- a/pipeline/download/pois.py
+++ b/pipeline/download/pois.py
@@ -111,9 +111,7 @@ def main() -> None:
    parser.add_argument(
        "--output", type=Path, required=True, help="Output parquet file path"
    )
-    parser.add_argument(
-        "--pbf", type=Path, required=True, help="Path to OSM PBF file"
-    )
+    parser.add_argument("--pbf", type=Path, required=True, help="Path to OSM PBF file")
    parser.add_argument(
        "--boundary",
        type=Path,
--- a/pipeline/download/rental_prices.py
+++ b/pipeline/download/rental_prices.py
@@ -99,10 +99,14 @@ def convert_to_parquet(xls_path: Path, parquet_path: Path) -> None:
    combined = pl.concat(frames)

    # Remap old LA codes to new unitary authority codes and average medians
-    combined = combined.with_columns(
-        pl.col("area_code").replace(LA_CONSOLIDATION),
-    ).group_by("area_code", "bedrooms").agg(
-        pl.col("median_monthly_rent").mean(),
+    combined = (
+        combined.with_columns(
+            pl.col("area_code").replace(LA_CONSOLIDATION),
+        )
+        .group_by("area_code", "bedrooms")
+        .agg(
+            pl.col("median_monthly_rent").mean(),
+        )
    )

    print(f"Combined: {combined.shape}")
--- a/pipeline/download/rightmove_outcodes.py
+++ b/pipeline/download/rightmove_outcodes.py
@@ -13,9 +13,7 @@ TYPEAHEAD_URL = "https://los.rightmove.co.uk/typeahead"

 def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:
    df = pl.read_parquet(postcodes_path, columns=["Postcode"])
-    outcodes = sorted(
-        set(df["Postcode"].str.split(" ").list.first().to_list()) - {""}
-    )
+    outcodes = sorted(set(df["Postcode"].str.split(" ").list.first().to_list()) - {""})
    print(f"Querying Rightmove typeahead for {len(outcodes)} outcodes...")

    mapping: dict[str, str] = {}
@@ -28,11 +26,9 @@ def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:
            data = resp.json()
            found = False
            for m in data.get("matches", []):
-                if (
-                    m["type"] == "OUTCODE"
-                    and m["displayName"].upper().replace(" ", "")
-                    == oc.upper().replace(" ", "")
-                ):
+                if m["type"] == "OUTCODE" and m["displayName"].upper().replace(
+                    " ", ""
+                ) == oc.upper().replace(" ", ""):
                    mapping[oc] = str(m["id"])
                    found = True
                    break
@@ -57,9 +53,7 @@ def fetch_outcode_ids(postcodes_path: Path, output: Path) -> None:


 def main() -> None:
-    parser = argparse.ArgumentParser(
-        description="Fetch Rightmove outcode ID mapping"
-    )
+    parser = argparse.ArgumentParser(description="Fetch Rightmove outcode ID mapping")
    parser.add_argument(
        "--postcodes", type=Path, required=True, help="postcode.parquet path"
    )
--- a/pipeline/download/tiles.py
+++ b/pipeline/download/tiles.py
@@ -64,7 +64,9 @@ def ensure_pmtiles_cli(bin_path: Path, version: str) -> None:

 def main():
    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument("--output", type=Path, required=True, help="Output .pmtiles path")
+    parser.add_argument(
+        "--output", type=Path, required=True, help="Output .pmtiles path"
+    )
    parser.add_argument(
        "--pmtiles-version", default="1.22.3", help="go-pmtiles release version"
    )
--- a/pipeline/download/transit_network.py
+++ b/pipeline/download/transit_network.py
@@ -56,7 +56,9 @@ NR_TIMETABLE_URL = "https://opendata.nationalrail.co.uk/api/staticfeeds/3.0/time
 USER_AGENT = "property-map-pipeline/1.0 (https://github.com)"


-def _download_http(url: str, dest: Path, *, desc: str, headers: dict | None = None) -> None:
+def _download_http(
+    url: str, dest: Path, *, desc: str, headers: dict | None = None
+) -> None:
    """Stream-download a URL to a file with progress bar."""
    dest.parent.mkdir(parents=True, exist_ok=True)
    tmp = dest.with_suffix(dest.suffix + ".tmp")
@@ -117,9 +119,10 @@ def clean_gtfs(src: Path, dst: Path) -> None:
        return

    print("Cleaning GTFS for R5 compatibility...")
-    with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
-        dst, "w", zipfile.ZIP_DEFLATED
-    ) as zout:
+    with (
+        zipfile.ZipFile(src, "r") as zin,
+        zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout,
+    ):
        for info in zin.infolist():
            if info.filename == "stop_times.txt":
                dropped = 0
@@ -127,7 +130,9 @@ def clean_gtfs(src: Path, dst: Path) -> None:
                    header = f.readline()
                    header_str = header.decode("utf-8").strip()
                    cols = header_str.split(",")
-                    arr_idx = cols.index("arrival_time") if "arrival_time" in cols else -1
+                    arr_idx = (
+                        cols.index("arrival_time") if "arrival_time" in cols else -1
+                    )
                    dep_idx = (
                        cols.index("departure_time") if "departure_time" in cols else -1
                    )
@@ -179,7 +184,9 @@ def clean_gtfs(src: Path, dst: Path) -> None:
                                year = int(date_val[:4])
                                if year > 2100:
                                    parts[i] = "20991231"
-                                    print(f"  feed_info: capped end_date {date_val} → 20991231")
+                                    print(
+                                        f"  feed_info: capped end_date {date_val} → 20991231"
+                                    )
                    fixed_lines.append(",".join(parts))
                zout.writestr("feed_info.txt", "\n".join(fixed_lines) + "\n")
            else:
@@ -334,7 +341,9 @@ def convert_high_freq_to_frequency_based(
        end_secs = trips[-1][1] + int(median_hw)
        headway_rounded = max(60, round(median_hw / 60) * 60)

-        frequency_entries.append((template_trip_id, start_secs, end_secs, headway_rounded))
+        frequency_entries.append(
+            (template_trip_id, start_secs, end_secs, headway_rounded)
+        )
        for trip_id, _ in trips[1:]:
            trips_to_remove.add(trip_id)
        groups_converted += 1
@@ -344,9 +353,10 @@ def convert_high_freq_to_frequency_based(
    print(f"  Created {len(frequency_entries)} frequency entries")

    # Step 5: Write modified GTFS
-    with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
-        dst, "w", zipfile.ZIP_DEFLATED
-    ) as zout:
+    with (
+        zipfile.ZipFile(src, "r") as zin,
+        zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout,
+    ):
        for info in zin.infolist():
            if info.filename == "trips.txt":
                with zin.open(info) as f:
@@ -466,15 +476,22 @@ def download_national_rail_cif(raw_dir: Path) -> Path | None:
    email = os.environ.get("NATIONAL_RAIL_EMAIL")
    password = os.environ.get("NATIONAL_RAIL_PASSWORD")
    if not email or not password:
-        print("Warning: NATIONAL_RAIL_EMAIL/NATIONAL_RAIL_PASSWORD not set, skipping national rail")
+        print(
+            "Warning: NATIONAL_RAIL_EMAIL/NATIONAL_RAIL_PASSWORD not set, skipping national rail"
+        )
        return None

    print("Authenticating with National Rail Open Data...")
-    auth_data = urllib.parse.urlencode({"username": email, "password": password}).encode()
+    auth_data = urllib.parse.urlencode(
+        {"username": email, "password": password}
+    ).encode()
    auth_req = urllib.request.Request(
        NR_AUTH_URL,
        data=auth_data,
-        headers={"User-Agent": USER_AGENT, "Content-Type": "application/x-www-form-urlencoded"},
+        headers={
+            "User-Agent": USER_AGENT,
+            "Content-Type": "application/x-www-form-urlencoded",
+        },
    )
    with urllib.request.urlopen(auth_req) as resp:
        token_data = json.loads(resp.read())
@@ -565,9 +582,10 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
    coords_fixed = 0
    route_types_fixed = 0

-    with zipfile.ZipFile(src, "r") as zin, zipfile.ZipFile(
-        dst, "w", zipfile.ZIP_DEFLATED
-    ) as zout:
+    with (
+        zipfile.ZipFile(src, "r") as zin,
+        zipfile.ZipFile(dst, "w", zipfile.ZIP_DEFLATED) as zout,
+    ):
        for info in zin.infolist():
            # Skip non-standard links.txt
            if info.filename == "links.txt":
@@ -581,8 +599,12 @@ def clean_national_rail_gtfs(src: Path, dst: Path) -> None:
                    trip_id_idx = cols.index("trip_id")
                    stop_id_idx = cols.index("stop_id")
                    seq_idx = cols.index("stop_sequence")
-                    pickup_idx = cols.index("pickup_type") if "pickup_type" in cols else -1
-                    dropoff_idx = cols.index("drop_off_type") if "drop_off_type" in cols else -1
+                    pickup_idx = (
+                        cols.index("pickup_type") if "pickup_type" in cols else -1
+                    )
+                    dropoff_idx = (
+                        cols.index("drop_off_type") if "drop_off_type" in cols else -1
+                    )

                    tmp = tempfile.NamedTemporaryFile(
                        mode="wb", delete=False, suffix=".txt"
@@ -769,16 +791,27 @@ def _docker_run_dtd2mysql(
 ) -> None:
    """Run dtd2mysql in a Node.js container on the same Docker network as MariaDB."""
    cmd = [
-        "docker", "run", "--rm", "--network", network,
-        "-e", f"DATABASE_HOSTNAME={db_container}",
-        "-e", "DATABASE_USERNAME=root",
-        "-e", "DATABASE_PASSWORD=root",
-        "-e", "DATABASE_NAME=dtd",
+        "docker",
+        "run",
+        "--rm",
+        "--network",
+        network,
+        "-e",
+        f"DATABASE_HOSTNAME={db_container}",
+        "-e",
+        "DATABASE_USERNAME=root",
+        "-e",
+        "DATABASE_PASSWORD=root",
+        "-e",
+        "DATABASE_NAME=dtd",
    ]
    for v in volumes:
        cmd.extend(["-v", v])
    # Install zip (needed for --gtfs-zip) then run dtd2mysql
-    inner = "apt-get update -qq && apt-get install -y -qq zip > /dev/null 2>&1 && npx --yes dtd2mysql " + " ".join(args)
+    inner = (
+        "apt-get update -qq && apt-get install -y -qq zip > /dev/null 2>&1 && npx --yes dtd2mysql "
+        + " ".join(args)
+    )
    cmd.extend(["node:20", "bash", "-c", inner])
    subprocess.run(cmd, check=True)

@@ -805,11 +838,17 @@ def convert_national_rail_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
        subprocess.run(["docker", "network", "create", network], capture_output=True)
        subprocess.run(
            [
-                "docker", "run", "-d",
-                "--name", db_container,
-                "--network", network,
-                "-e", "MARIADB_ROOT_PASSWORD=root",
-                "-e", "MARIADB_DATABASE=dtd",
+                "docker",
+                "run",
+                "-d",
+                "--name",
+                db_container,
+                "--network",
+                network,
+                "-e",
+                "MARIADB_ROOT_PASSWORD=root",
+                "-e",
+                "MARIADB_DATABASE=dtd",
                "mariadb:latest",
            ],
            check=True,
@@ -820,7 +859,16 @@ def convert_national_rail_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:
            print("  Waiting for MariaDB to be ready...")
            for attempt in range(30):
                result = subprocess.run(
-                    ["docker", "exec", db_container, "mariadb", "-uroot", "-proot", "-e", "SELECT 1"],
+                    [
+                        "docker",
+                        "exec",
+                        db_container,
+                        "mariadb",
+                        "-uroot",
+                        "-proot",
+                        "-e",
+                        "SELECT 1",
+                    ],
                    capture_output=True,
                )
                if result.returncode == 0:
@@ -833,14 +881,16 @@ def convert_national_rail_to_gtfs(raw_dir: Path, output_dir: Path) -> Path:

            print("Importing CIF timetable into MariaDB...")
            _docker_run_dtd2mysql(
-                network, db_container,
+                network,
+                db_container,
                volumes=[f"{raw_abs}:/data:ro"],
                args=["--timetable", "/data/national_rail_cif.zip"],
            )

            print("Exporting GTFS from MariaDB...")
            _docker_run_dtd2mysql(
-                network, db_container,
+                network,
+                db_container,
                volumes=[f"{raw_abs}:/output"],
                args=["--gtfs-zip", "/output/national_rail_gtfs_raw.zip"],
            )
--- a/pipeline/transform/merge.py
+++ b/pipeline/transform/merge.py
@@ -94,11 +94,18 @@ def _build(

    # Remap terminated postcodes to nearest active successor
    postcode_mapping = build_postcode_mapping(arcgis_path)
-    wide = wide.join(
-        postcode_mapping.lazy(), left_on="postcode", right_on="old_postcode", how="left"
-    ).with_columns(
-        pl.coalesce("new_postcode", "postcode").alias("postcode"),
-    ).drop("new_postcode")
+    wide = (
+        wide.join(
+            postcode_mapping.lazy(),
+            left_on="postcode",
+            right_on="old_postcode",
+            how="left",
+        )
+        .with_columns(
+            pl.coalesce("new_postcode", "postcode").alias("postcode"),
+        )
+        .drop("new_postcode")
+    )

    arcgis = (
        pl.scan_parquet(arcgis_path)
@@ -252,16 +259,18 @@ def _build(
        .otherwise(pl.col("pp_property_type"))
        # Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes",
        # collapse terrace sub-types, and fold rare types into "Other"
-        .replace({
-            "Flat": "Flats/Maisonettes",
-            "Maisonette": "Flats/Maisonettes",
-            "End-Terrace": "Terraced",
-            "Mid-Terrace": "Terraced",
-            "Enclosed End-Terrace": "Terraced",
-            "Enclosed Mid-Terrace": "Terraced",
-            "Bungalow": "Other",
-            "Park home": "Other",
-        })
+        .replace(
+            {
+                "Flat": "Flats/Maisonettes",
+                "Maisonette": "Flats/Maisonettes",
+                "End-Terrace": "Terraced",
+                "Mid-Terrace": "Terraced",
+                "Enclosed End-Terrace": "Terraced",
+                "Enclosed Mid-Terrace": "Terraced",
+                "Bungalow": "Other",
+                "Park home": "Other",
+            }
+        )
        .alias("property_type")
    )

@@ -426,10 +435,16 @@ def main():
        help="Census 2021 population by LSOA parquet file",
    )
    parser.add_argument(
-        "--output-postcodes", type=Path, required=True, help="Output postcode parquet file path"
+        "--output-postcodes",
+        type=Path,
+        required=True,
+        help="Output postcode parquet file path",
    )
    parser.add_argument(
-        "--output-properties", type=Path, required=True, help="Output properties parquet file path"
+        "--output-properties",
+        type=Path,
+        required=True,
+        help="Output properties parquet file path",
    )
    args = parser.parse_args()

--- a/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
+++ b/pipeline/transform/postcode_boundaries/test_postcode_boundaries.py
@@ -454,9 +454,7 @@ class TestFillHoles:
        hole1 = [(10, 10), (20, 10), (20, 20), (10, 20), (10, 10)]
        outer2 = [(60, 60), (110, 60), (110, 110), (60, 110), (60, 60)]
        hole2 = [(70, 70), (80, 70), (80, 80), (70, 80), (70, 70)]
-        mp = MultiPolygon(
-            [Polygon(outer1, [hole1]), Polygon(outer2, [hole2])]
-        )
+        mp = MultiPolygon([Polygon(outer1, [hole1]), Polygon(outer2, [hole2])])
        result = _fill_holes(mp)
        assert result.geom_type == "MultiPolygon"
        for p in result.geoms:
--- a/pipeline/transform/price_estimation/backtest.py
+++ b/pipeline/transform/price_estimation/backtest.py
@@ -112,7 +112,9 @@ def predict(test: pl.DataFrame, index: pl.DataFrame) -> pl.DataFrame:


 def compute_metrics(actual: np.ndarray, predicted: np.ndarray) -> dict:
-    valid = np.isfinite(predicted) & np.isfinite(actual) & (actual > 0) & (predicted > 0)
+    valid = (
+        np.isfinite(predicted) & np.isfinite(actual) & (actual > 0) & (predicted > 0)
+    )
    actual = actual[valid]
    predicted = predicted[valid]

@@ -176,7 +178,10 @@ def main():
        "--input", type=Path, required=True, help="Path to properties.parquet"
    )
    parser.add_argument(
-        "--postcodes", type=Path, required=True, help="Path to postcode.parquet (for lat/lon)"
+        "--postcodes",
+        type=Path,
+        required=True,
+        help="Path to postcode.parquet (for lat/lon)",
    )
    parser.add_argument(
        "--output", type=Path, required=True, help="Output backtest_results.parquet"
@@ -185,7 +190,9 @@ def main():

    # Build index from pre-test data only (temporal holdout)
    print(f"Building price index (pairs with year2 < {TEST_YEAR_MIN})...")
-    index = build_index(args.input, max_pair_year=TEST_YEAR_MIN, postcodes_path=args.postcodes)
+    index = build_index(
+        args.input, max_pair_year=TEST_YEAR_MIN, postcodes_path=args.postcodes
+    )
    print(
        f"\nHoldout index: {len(index):,} rows, {index['sector'].n_unique():,} sectors, "
        f"{index['type_group'].n_unique()} type groups"
@@ -233,7 +240,9 @@ def main():
    knn_est = knn_psm * fa * temporal_adj

    n_knn = int((np.isfinite(knn_est) & (knn_est > 0)).sum())
-    print(f"  kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)")
+    print(
+        f"  kNN estimates: {n_knn:,} of {len(test):,} ({n_knn / len(test) * 100:.1f}%)"
+    )

    # Blend: (1-w)*index + w*kNN where both available
    index_est = test["predicted"].to_numpy().astype(np.float64)
--- a/pipeline/transform/price_estimation/estimate.py
+++ b/pipeline/transform/price_estimation/estimate.py
@@ -107,9 +107,7 @@ def main():
        pl.when(has_price)
        .then(
            pl.col("Last known price").cast(pl.Float64)
-            * (
-                pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp")
-            )
+            * (pl.col("_log_index_current_interp") - pl.col("_log_index_sale_interp"))
            .clip(-MAX_LOG_ADJUSTMENT, MAX_LOG_ADJUSTMENT)
            .exp()
        )
--- a/pipeline/transform/price_estimation/index.py
+++ b/pipeline/transform/price_estimation/index.py
@@ -105,9 +105,7 @@ def extract_pairs(input_path: Path, max_year2: int | None = None) -> pl.DataFram
            .alias("log_ratio"),
            (
                1.0
-                / (pl.col("frac_year2") - pl.col("frac_year1"))
-                .cast(pl.Float64)
-                .sqrt()
+                / (pl.col("frac_year2") - pl.col("frac_year1")).cast(pl.Float64).sqrt()
            ).alias("weight"),
        )
        .filter(pl.col("log_ratio").abs() <= OUTLIER_THRESHOLD)
@@ -453,8 +451,12 @@ def main():
        description="Build improved repeat-sales price index"
    )
    parser.add_argument("--input", type=Path, required=True)
-    parser.add_argument("--postcodes", type=Path, required=True,
-                        help="Path to postcode.parquet (for lat/lon centroids)")
+    parser.add_argument(
+        "--postcodes",
+        type=Path,
+        required=True,
+        help="Path to postcode.parquet (for lat/lon centroids)",
+    )
    parser.add_argument("--output", type=Path, required=True)
    args = parser.parse_args()

--- a/pipeline/transform/price_estimation/knn.py
+++ b/pipeline/transform/price_estimation/knn.py
@@ -43,48 +43,39 @@ def build_knn_pool(
    """
    print("Building kNN pool...")
    lf = pl.scan_parquet(source) if isinstance(source, Path) else source
-    query = (
-        lf
-        .select(
-            "Postcode",
-            "Property type",
-            "lat",
-            "lon",
-            "Total floor area (sqm)",
-            "Last known price",
-            "Date of last transaction",
-        )
-        .filter(
-            pl.col("lat").is_not_null(),
-            pl.col("lon").is_not_null(),
-            pl.col("Total floor area (sqm)").is_not_null(),
-            pl.col("Total floor area (sqm)") > 0,
-            pl.col("Last known price").is_not_null(),
-            pl.col("Last known price") > 0,
-            pl.col("Postcode").is_not_null(),
-            pl.col("Date of last transaction").is_not_null(),
-        )
+    query = lf.select(
+        "Postcode",
+        "Property type",
+        "lat",
+        "lon",
+        "Total floor area (sqm)",
+        "Last known price",
+        "Date of last transaction",
+    ).filter(
+        pl.col("lat").is_not_null(),
+        pl.col("lon").is_not_null(),
+        pl.col("Total floor area (sqm)").is_not_null(),
+        pl.col("Total floor area (sqm)") > 0,
+        pl.col("Last known price").is_not_null(),
+        pl.col("Last known price") > 0,
+        pl.col("Postcode").is_not_null(),
+        pl.col("Date of last transaction").is_not_null(),
    )
    if max_sale_year is not None:
        query = query.filter(
            pl.col("Date of last transaction").dt.year() < max_sale_year
        )

-    pool = (
-        query.with_columns(
-            sector_expr(),
-            type_group_expr(),
-            (
-                pl.col("Date of last transaction").dt.year().cast(pl.Float64)
-                + (
-                    pl.col("Date of last transaction").dt.month().cast(pl.Float64)
-                    - 1.0
-                )
-                / 12.0
-            ).alias("_sale_fy"),
-            pl.lit(ref_frac_year).alias("_ref_fy"),
-        ).collect()
-    )
+    pool = query.with_columns(
+        sector_expr(),
+        type_group_expr(),
+        (
+            pl.col("Date of last transaction").dt.year().cast(pl.Float64)
+            + (pl.col("Date of last transaction").dt.month().cast(pl.Float64) - 1.0)
+            / 12.0
+        ).alias("_sale_fy"),
+        pl.lit(ref_frac_year).alias("_ref_fy"),
+    ).collect()
    pool = pool.filter(pl.col("type_group").is_not_null())
    print(f"  {len(pool):,} pool properties with lat/lon, floor area, price")

--- a/pipeline/transform/transform_poi.py
+++ b/pipeline/transform/transform_poi.py
@@ -1085,7 +1085,9 @@ def transform(
        if cat not in all_set:
            mapped_but_absent.append(cat)
    if mapped_but_absent:
-        print(f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}")
+        print(
+            f"CATEGORY_MAP categories not in data (skipped): {sorted(mapped_but_absent)}"
+        )

    # Drop unwanted categories
    lf = lf.filter(~pl.col("category").is_in(list(DROP_CATEGORIES)))
--- a/pipeline/utils/england_geometry.py
+++ b/pipeline/utils/england_geometry.py
@@ -27,7 +27,9 @@ def load_england_polygon(geojson_path: Path) -> PreparedGeometry:
    return prep(geometry)


-def in_england_mask(geojson_path: Path, lats: np.ndarray, lngs: np.ndarray) -> np.ndarray:
+def in_england_mask(
+    geojson_path: Path, lats: np.ndarray, lngs: np.ndarray
+) -> np.ndarray:
    """Vectorized check: which (lat, lng) points are within England.

    Returns a boolean numpy array.
--- a/pipeline/utils/poi_counts.py
+++ b/pipeline/utils/poi_counts.py
@@ -106,7 +106,9 @@ def count_pois_per_postcode(
            if nearby is None:
                continue

-            distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i])
+            distances = haversine_km(
+                poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i]
+            )

            within_mask = distances <= radius_km
            within_indices = nearby[within_mask]
@@ -179,7 +181,9 @@ def min_distance_per_postcode(
            if nearby is None:
                continue

-            distances = haversine_km(poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i])
+            distances = haversine_km(
+                poi_lats[nearby], poi_lngs[nearby], pc_lats[i], pc_lons[i]
+            )

            for group, cat_mask in category_masks.items():
                group_mask = cat_mask[nearby]
--- a/pipeline/utils/postcode_mapping.py
+++ b/pipeline/utils/postcode_mapping.py
@@ -15,26 +15,49 @@ def build_postcode_mapping(arcgis_path: Path) -> pl.DataFrame:
    """
    arcgis = pl.scan_parquet(arcgis_path).filter(pl.col("ctry") == "E92000001")

-    active = arcgis.filter(pl.col("doterm").is_null()).select("pcds", "oseast1m", "osnrth1m").collect()
-    terminated = arcgis.filter(pl.col("doterm").is_not_null()).select("pcds", "oseast1m", "osnrth1m").collect()
+    active = (
+        arcgis.filter(pl.col("doterm").is_null())
+        .select("pcds", "oseast1m", "osnrth1m")
+        .collect()
+    )
+    terminated = (
+        arcgis.filter(pl.col("doterm").is_not_null())
+        .select("pcds", "oseast1m", "osnrth1m")
+        .collect()
+    )

-    print(f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}")
+    print(
+        f"Active postcodes: {active.height}, terminated postcodes: {terminated.height}"
+    )

    if terminated.height == 0:
-        return pl.DataFrame({"old_postcode": pl.Series([], dtype=pl.Utf8), "new_postcode": pl.Series([], dtype=pl.Utf8)})
+        return pl.DataFrame(
+            {
+                "old_postcode": pl.Series([], dtype=pl.Utf8),
+                "new_postcode": pl.Series([], dtype=pl.Utf8),
+            }
+        )

-    active_coords = np.column_stack([active["oseast1m"].to_numpy(), active["osnrth1m"].to_numpy()])
-    terminated_coords = np.column_stack([terminated["oseast1m"].to_numpy(), terminated["osnrth1m"].to_numpy()])
+    active_coords = np.column_stack(
+        [active["oseast1m"].to_numpy(), active["osnrth1m"].to_numpy()]
+    )
+    terminated_coords = np.column_stack(
+        [terminated["oseast1m"].to_numpy(), terminated["osnrth1m"].to_numpy()]
+    )

    tree = cKDTree(active_coords)
    distances, indices = tree.query(terminated_coords)

    active_postcodes = active["pcds"]
-    mapping = pl.DataFrame({
-        "old_postcode": terminated["pcds"],
-        "new_postcode": active_postcodes.gather(indices),
-    })
+    mapping = pl.DataFrame(
+        {
+            "old_postcode": terminated["pcds"],
+            "new_postcode": active_postcodes.gather(indices),
+        }
+    )

-    print(f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m")
+    print(
+        f"Postcode mapping: max distance = {distances.max():.0f}m, median = {np.median(distances):.0f}m"
+    )

    return mapping
--- a/pipeline/utils/test_poi_counts.py
+++ b/pipeline/utils/test_poi_counts.py
@@ -72,7 +72,9 @@ def test_no_pois_returns_zeros(postcodes):
            "category": pl.Series([], dtype=pl.String),
        }
    )
-    result = count_pois_per_postcode(postcodes, empty_pois, groups=POI_GROUPS, radius_km=2.0)
+    result = count_pois_per_postcode(
+        postcodes, empty_pois, groups=POI_GROUPS, radius_km=2.0
+    )

    for group in POI_GROUPS:
        col = f"{group}_2km"
@@ -125,7 +127,9 @@ def test_min_distance_no_pois_returns_nan(postcodes):
            "category": pl.Series([], dtype=pl.String),
        }
    )
-    result = min_distance_per_postcode(postcodes, empty_pois, groups={"train_tube": ["Rail station"]})
+    result = min_distance_per_postcode(
+        postcodes, empty_pois, groups={"train_tube": ["Rail station"]}
+    )

    assert "train_tube_nearest_km" in result.columns
    assert all(np.isnan(v) for v in result["train_tube_nearest_km"].to_list())