LGTM

2026-05-14 08:09:19 +01:00 · 2026-05-14 08:09:19 +01:00 · a4103b0896
commit a4103b0896
parent a8165249a4
64 changed files with 5376 additions and 3832 deletions
--- a/pipeline/transform/crime.py
+++ b/pipeline/transform/crime.py
@@ -5,6 +5,7 @@ from pathlib import Path
 import polars as pl

 STREET_CRIME_CSV_RE = re.compile(r"^\d{4}-\d{2}-.+-street\.csv$")
+MONTH_RE = r"^\d{4}-\d{2}$"


 def find_street_crime_csvs(crime_dir: Path) -> tuple[list[Path], int]:
@@ -37,16 +38,45 @@ def transform_crime(crime_dir: Path, output_path: Path) -> None:
        },
    ).select("LSOA code", "Crime type", "Month")

-    # Extract year, count crimes per LSOA / year / crime type
+    valid_month_expr = pl.col("Month").str.contains(MONTH_RE)
+    valid_months = (
+        df.filter(valid_month_expr)
+        .select("Month")
+        .unique()
+        .collect(engine="streaming")["Month"]
+        .sort()
+        .to_list()
+    )
+    if not valid_months:
+        raise ValueError(f"No valid crime months found in {crime_dir}")
+
+    valid_month_count = len(valid_months)
+    print(
+        f"Using {valid_month_count} valid data months "
+        f"({valid_months[0]} to {valid_months[-1]})"
+    )
+
+    # Count monthly incidents, then annualise over every valid month in the dataset.
    yearly_counts = (
-        df.filter(pl.col("LSOA code").is_not_null() & (pl.col("LSOA code") != ""))
-        .with_columns(pl.col("Month").str.slice(0, 4).alias("year"))
-        .group_by("LSOA code", "year", "Crime type")
+        df.filter(
+            valid_month_expr
+            & pl.col("LSOA code").is_not_null()
+            & (pl.col("LSOA code") != "")
+            & pl.col("Crime type").is_not_null()
+            & (pl.col("Crime type") != "")
+        )
+        .group_by("LSOA code", "Month", "Crime type")
        .agg(pl.len().alias("count"))
        .group_by("LSOA code", "Crime type")
-        .agg(pl.col("count").mean().round(1).alias("yearly_avg"))
+        .agg(
+            (pl.col("count").sum() / pl.lit(valid_month_count) * 12)
+            .round(1)
+            .alias("yearly_avg")
+        )
        .collect(engine="streaming")
    )
+    if yearly_counts.is_empty():
+        raise ValueError(f"No valid crime rows found in {crime_dir}")

    print(f"Crime types: {sorted(yearly_counts['Crime type'].unique().to_list())}")

--- a/pipeline/transform/price_estimation/test_knn.py
+++ b/pipeline/transform/price_estimation/test_knn.py
@@ -0,0 +1,94 @@
+from datetime import date
+
+import numpy as np
+import polars as pl
+
+from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
+from pipeline.transform.price_estimation.knn import build_knn_pool, knn_median_psm
+from pipeline.transform.price_estimation.utils import TYPE_GROUPS, type_group_expr
+
+
+def _flat_index() -> pl.DataFrame:
+    return pl.DataFrame(
+        {
+            "sector": ["AA1 1", "AA1 1"],
+            "type_group": ["Detached", "All"],
+            "year": [2026, 2026],
+            "log_index": [0.0, 0.0],
+        }
+    )
+
+
+def test_knn_excludes_same_sale_and_uses_stable_comparables():
+    sale_date = date(2026, 1, 1)
+    rows = [
+        {
+            "Postcode": "AA1 1AA",
+            "Property type": "Detached",
+            "lat": 51.5000,
+            "lon": -0.1000,
+            "Total floor area (sqm)": 80.0,
+            "Last known price": 900_000.0,
+            "Date of last transaction": sale_date,
+        }
+    ]
+    rows.extend(
+        {
+            "Postcode": "AA1 1AA",
+            "Property type": "Detached",
+            "lat": 51.5001 + i * 0.00001,
+            "lon": -0.1001,
+            "Total floor area (sqm)": 20.0,
+            "Last known price": 900_000.0,
+            "Date of last transaction": sale_date,
+        }
+        for i in range(5)
+    )
+    rows.extend(
+        {
+            "Postcode": f"AA1 1B{i}",
+            "Property type": "Detached",
+            "lat": 51.5010 + i * 0.00001,
+            "lon": -0.1010,
+            "Total floor area (sqm)": 80.0,
+            "Last known price": 200_000.0,
+            "Date of last transaction": sale_date,
+        }
+        for i in range(5)
+    )
+    df = pl.DataFrame(rows)
+
+    trees = build_knn_pool(df.lazy(), _flat_index(), 2026.0)
+    psm = knn_median_psm(
+        trees,
+        lat=np.array([51.5000]),
+        lon=np.array([-0.1000]),
+        type_groups=np.array(["Detached"]),
+        postcodes=np.array(["AA1 1AA"]),
+        last_prices=np.array([900_000.0]),
+        last_sale_dates=np.array(
+            [sale_date.toordinal() - date(1970, 1, 1).toordinal()]
+        ),
+    )
+
+    assert psm[0] == 2_500.0
+
+
+def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
+    blended = guarded_blend_estimates(
+        index_est=np.array([120_000.0, 1_000_000.0]),
+        knn_est=np.array([5_000_000.0, 1_000_000.0]),
+        last_prices=np.array([100_000.0, 100_000.0]),
+    )
+
+    assert blended[0] == 120_000.0
+    assert blended[1] == 600_000.0
+
+
+def test_bungalow_is_not_a_dead_price_index_type_group():
+    df = pl.DataFrame({"Property type": ["Bungalow", "Other"]}).with_columns(
+        type_group_expr()
+    )
+
+    assert "Bungalow" not in TYPE_GROUPS
+    assert df["type_group"].to_list() == [None, None]
--- a/pipeline/transform/test_crime.py
+++ b/pipeline/transform/test_crime.py
@@ -44,4 +44,76 @@ def test_transform_crime_reads_only_street_crime_csvs(tmp_path):

    result = pl.read_parquet(output).to_dicts()

-    assert result == [{"LSOA code": "E01000001", "Burglary (avg/yr)": 2.0}]
+    assert result == [{"LSOA code": "E01000001", "Burglary (avg/yr)": 24.0}]
+
+
+def test_transform_crime_annualises_over_all_valid_months(tmp_path):
+    crime_dir = tmp_path / "crime"
+    jan_dir = crime_dir / "2024-01"
+    feb_dir = crime_dir / "2024-02"
+    jan_dir.mkdir(parents=True)
+    feb_dir.mkdir(parents=True)
+
+    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
+    (jan_dir / "2024-01-test-force-street.csv").write_text(
+        "\n".join(
+            [
+                header,
+                "1,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
+                "2,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
+                "3,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
+            ]
+        )
+        + "\n"
+    )
+    (feb_dir / "2024-02-test-force-street.csv").write_text(
+        "\n".join(
+            [
+                header,
+                "4,2024-02,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
+            ]
+        )
+        + "\n"
+    )
+
+    output = tmp_path / "crime.parquet"
+    transform_crime(crime_dir, output)
+
+    result = pl.read_parquet(output).sort("LSOA code").to_dicts()
+
+    assert result == [
+        {
+            "LSOA code": "E01000001",
+            "Burglary (avg/yr)": 12.0,
+            "Robbery (avg/yr)": 0.0,
+        },
+        {
+            "LSOA code": "E01000002",
+            "Burglary (avg/yr)": 0.0,
+            "Robbery (avg/yr)": 12.0,
+        },
+    ]
+
+
+def test_transform_crime_fails_without_valid_months(tmp_path):
+    crime_dir = tmp_path / "crime"
+    month_dir = crime_dir / "2024-01"
+    month_dir.mkdir(parents=True)
+    (month_dir / "2024-01-test-force-street.csv").write_text(
+        "\n".join(
+            [
+                "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context",
+                "1,,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
+            ]
+        )
+        + "\n"
+    )
+
+    output = tmp_path / "crime.parquet"
+
+    try:
+        transform_crime(crime_dir, output)
+    except ValueError as exc:
+        assert "No valid crime months" in str(exc)
+    else:
+        raise AssertionError("Expected ValueError")
--- a/pipeline/transform/test_join_epc_pp.py
+++ b/pipeline/transform/test_join_epc_pp.py
@@ -136,17 +136,17 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(
        {
-            "price": [250_000],
-            "date_of_transfer": [date(2024, 2, 3)],
-            "property_type": ["T"],
-            "postcode": ["AA1 1AA"],
-            "paon": ["1"],
-            "saon": [None],
-            "street": ["Example Street"],
-            "locality": [None],
-            "town_city": ["Exampletown"],
-            "duration": ["F"],
-            "old_new": ["N"],
+            "price": [200_000, 250_000],
+            "date_of_transfer": [date(2020, 2, 3), date(2024, 2, 3)],
+            "property_type": ["T", "T"],
+            "postcode": ["AA1 1AA", "AA1 1AA"],
+            "paon": ["1", "1"],
+            "saon": [None, None],
+            "street": ["Example-Street", "Example Street"],
+            "locality": [None, None],
+            "town_city": ["Exampletown", "Exampletown"],
+            "duration": ["F", "F"],
+            "old_new": ["N", "N"],
        }
    ).write_parquet(price_paid_path)

@@ -172,3 +172,85 @@ def test_run_joins_domestic_zip_with_price_paid(tmp_path: Path):
        }
    ]
    assert df.get_column("renovation_history").list.len().to_list() == [1]
+    assert df.get_column("historical_prices").list.len().to_list() == [2]
+
+
+def test_run_excludes_price_paid_rows_without_full_postcode(tmp_path: Path):
+    zip_path = tmp_path / "domestic-csv.zip"
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+        csv_buffer = io.StringIO()
+        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
+        writer.writeheader()
+        writer.writerow(_row())
+        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
+
+    price_paid_path = tmp_path / "price-paid.parquet"
+    pl.DataFrame(
+        {
+            "price": [250_000, 300_000],
+            "date_of_transfer": [date(2024, 2, 3), date(2024, 2, 4)],
+            "property_type": ["T", "T"],
+            "postcode": ["AA1 1AA", ""],
+            "paon": ["1", "2"],
+            "saon": [None, None],
+            "street": ["Example Street", "Example Street"],
+            "locality": [None, None],
+            "town_city": ["Exampletown", "Exampletown"],
+            "duration": ["F", "F"],
+            "old_new": ["N", "N"],
+        }
+    ).write_parquet(price_paid_path)
+
+    output_path = tmp_path / "epc-pp.parquet"
+    _run(zip_path, price_paid_path, output_path, tmp_path)
+
+    df = pl.read_parquet(output_path)
+
+    assert df["postcode"].to_list() == ["AA1 1AA"]
+
+
+def test_run_does_not_attach_epc_facts_to_low_score_address_match(tmp_path: Path):
+    zip_path = tmp_path / "domestic-csv.zip"
+    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
+        csv_buffer = io.StringIO()
+        writer = csv.DictWriter(csv_buffer, fieldnames=EPC_SOURCE_COLUMNS)
+        writer.writeheader()
+        writer.writerow(_row(address="1 Totally Different Road"))
+        archive.writestr("certificates-2024.csv", csv_buffer.getvalue())
+
+    price_paid_path = tmp_path / "price-paid.parquet"
+    pl.DataFrame(
+        {
+            "price": [250_000],
+            "date_of_transfer": [date(2024, 2, 3)],
+            "property_type": ["T"],
+            "postcode": ["AA1 1AA"],
+            "paon": ["1"],
+            "saon": [None],
+            "street": ["Example Street"],
+            "locality": [None],
+            "town_city": ["Exampletown"],
+            "duration": ["F"],
+            "old_new": ["N"],
+        }
+    ).write_parquet(price_paid_path)
+
+    output_path = tmp_path / "epc-pp.parquet"
+    _run(zip_path, price_paid_path, output_path, tmp_path)
+
+    df = pl.read_parquet(output_path)
+
+    assert df.height == 1
+    assert df.select(
+        "pp_address",
+        "epc_address",
+        "total_floor_area",
+        "current_energy_rating",
+    ).to_dicts() == [
+        {
+            "pp_address": "1 Example Street",
+            "epc_address": None,
+            "total_floor_area": None,
+            "current_energy_rating": None,
+        }
+    ]
--- a/pipeline/transform/test_merge.py
+++ b/pipeline/transform/test_merge.py
@@ -1,8 +1,14 @@
 import polars as pl
+import pytest

 from pipeline.transform.merge import (
+    _AREA_COLUMNS,
+    TREE_DENSITY_FEATURE,
    _is_dynamic_poi_metric_column,
    _less_deprived_percentile_expr,
+    _tree_density_by_postcode,
+    _validate_lad_source_coverage,
+    _validate_property_postcodes,
 )


@@ -36,3 +42,103 @@ def test_dynamic_poi_metric_columns_are_area_level() -> None:
    assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 2km")
    assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 5km")
    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
+
+
+def test_country_code_is_kept_in_postcode_area_columns() -> None:
+    assert "ctry25cd" in _AREA_COLUMNS
+
+
+def test_validate_property_postcodes_rejects_blank_rows() -> None:
+    df = pl.DataFrame(
+        {
+            "Postcode": ["AA1 1AA", ""],
+            "Address per Property Register": ["1 Example Street", "2 Example Street"],
+            "Last known price": [100_000, 200_000],
+        }
+    )
+
+    with pytest.raises(ValueError, match="Property rows missing a postcode"):
+        _validate_property_postcodes(df)
+
+
+def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
+    tmp_path,
+) -> None:
+    iod_path = tmp_path / "iod.parquet"
+    ethnicity_path = tmp_path / "ethnicity.parquet"
+    rental_path = tmp_path / "rental.parquet"
+    pl.DataFrame(
+        {
+            "Local Authority District code (2024)": [
+                "E08000016",
+                "E06000053",
+                "E09000001",
+            ],
+            "Local Authority District name (2024)": [
+                "Barnsley",
+                "Isles of Scilly",
+                "City of London",
+            ],
+        }
+    ).write_parquet(iod_path)
+    pl.DataFrame(
+        {"Geography_code": ["E08000016", "E06000053", "E09000001"]}
+    ).write_parquet(ethnicity_path)
+    pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]}).write_parquet(
+        rental_path
+    )
+
+    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
+
+
+def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
+    iod_path = tmp_path / "iod.parquet"
+    ethnicity_path = tmp_path / "ethnicity.parquet"
+    rental_path = tmp_path / "rental.parquet"
+    pl.DataFrame(
+        {
+            "Local Authority District code (2024)": ["E08000016"],
+            "Local Authority District name (2024)": ["Barnsley"],
+        }
+    ).write_parquet(iod_path)
+    pl.DataFrame({"Geography_code": ["E08000016"]}).write_parquet(ethnicity_path)
+    pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]}).write_parquet(
+        rental_path
+    )
+
+    with pytest.raises(ValueError, match="Rental data is missing"):
+        _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
+
+
+def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
+    path = tmp_path / "tree_density_by_postcode.parquet"
+    pl.DataFrame(
+        {
+            "postcode": ["AB1 2CD", "EF3 4GH"],
+            "Tree canopy density percentile within 50m": [12.5, 99.0],
+        }
+    ).write_parquet(path)
+
+    result = _tree_density_by_postcode(path).collect().sort("postcode")
+
+    assert result.columns == ["postcode", TREE_DENSITY_FEATURE]
+    assert result[TREE_DENSITY_FEATURE].to_list() == [12.5, 99.0]
+    assert result.schema[TREE_DENSITY_FEATURE] == pl.Float32
+
+
+def test_tree_density_by_postcode_requires_postcode_and_density_columns(
+    tmp_path,
+) -> None:
+    path = tmp_path / "tree_density_by_postcode.parquet"
+    pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]}).write_parquet(path)
+
+    with pytest.raises(ValueError, match="must contain column"):
+        _tree_density_by_postcode(path)
+
+    missing_postcode_path = tmp_path / "missing_postcode.parquet"
+    pl.DataFrame({"Tree canopy density percentile within 50m": [12.5]}).write_parquet(
+        missing_postcode_path
+    )
+
+    with pytest.raises(ValueError, match="missing required column: postcode"):
+        _tree_density_by_postcode(missing_postcode_path)