import polars as pl
import pyarrow as pa
import pytest
from shapely import box, to_wkb

from pipeline.transform.merge import (
    _AREA_COLUMNS,
    CONSERVATION_AREA_FEATURE,
    LISTED_BUILDING_FEATURE,
    TREE_DENSITY_FEATURE,
    _is_unpublished_conservation_area_record,
    _is_dynamic_poi_metric_column,
    _less_deprived_percentile_expr,
    _load_conservation_area_geometries,
    _matched_listed_building_flags,
    _postcode_conservation_area_flags,
    _postcode_listed_building_candidates,
    _tree_density_by_postcode,
    _validate_lad_source_coverage,
    _validate_property_postcodes,
)


def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """The lowest score maps to 100, the highest to 0, and nulls stay null."""
    column = "Income Score (rate)"
    frame = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})

    percentile_expr = _less_deprived_percentile_expr(column)
    collected = frame.lazy().with_columns(percentile_expr).collect()

    assert collected[column].to_list() == [100.0, 50.0, 0.0, None]


def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied minimum values all map to 100 and tied maximum values all map to 0."""
    column = "Income Score (rate)"
    frame = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]})

    percentile_expr = _less_deprived_percentile_expr(column)
    collected = frame.lazy().with_columns(percentile_expr).collect()

    assert collected[column].to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]


def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Amenity distance/count columns are recognised; a restaurant count is not."""
    dynamic_columns = (
        "Distance to nearest amenity (Cafe) (km)",
        "Distance to nearest amenity (Park) (km)",
        "Number of amenities (Cafe) within 2km",
        "Number of amenities (Cafe) within 5km",
    )
    for column in dynamic_columns:
        assert _is_dynamic_poi_metric_column(column)

    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")


def test_country_code_is_kept_in_postcode_area_columns() -> None:
    """The ``ctry25cd`` column is listed among the area columns."""
    assert "ctry25cd" in _AREA_COLUMNS


def test_conservation_area_feature_is_area_level() -> None:
    """The conservation-area feature is listed among the area columns."""
    assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS


def test_listed_building_feature_is_property_level() -> None:
    """The listed-building feature is not listed among the area columns."""
    assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS


def test_postcode_conservation_area_flags_marks_point_membership() -> None:
    """A point inside the area is "Yes"; outside or with a null latitude is "No"."""
    postcode_points = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
            "lat": [0.5, 2.0, None],
            "lon": [0.5, 2.0, 0.5],
        }
    )
    unit_square = box(0, 0, 1, 1)

    flags = _postcode_conservation_area_flags(
        postcode_points, [unit_square], "EPSG:4326", batch_size=2
    )

    expected_rows = [
        {"postcode": "AA1 1AA", CONSERVATION_AREA_FEATURE: "Yes"},
        {"postcode": "BB1 1BB", CONSERVATION_AREA_FEATURE: "No"},
        {"postcode": "CC1 1CC", CONSERVATION_AREA_FEATURE: "No"},
    ]
    assert flags.sort("postcode").to_dicts() == expected_rows


def test_unpublished_conservation_area_records_are_identified() -> None:
    """Only the placeholder name is treated as unpublished; real names and None are not."""
    placeholder_name = "No data available for publication by HE"

    assert _is_unpublished_conservation_area_record(placeholder_name)
    assert not _is_unpublished_conservation_area_record("Bloomsbury")
    assert not _is_unpublished_conservation_area_record(None)


def test_load_conservation_area_geometries_skips_unpublished_placeholders(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path,
) -> None:
    """Geometries whose NAME is the unpublished placeholder are dropped on load."""
    gpkg_path = tmp_path / "conservation_areas.gpkg"
    real_area = box(0, 0, 1, 1)
    placeholder_area = box(-100, -100, 100, 100)

    def stub_read_arrow(path, columns):
        # Check how the loader calls the reader before handing back the fixture.
        assert path == gpkg_path
        assert columns == ["NAME"]
        names = [
            "Central Village",
            "No data available for publication by HE",
        ]
        shapes = to_wkb([real_area, placeholder_area])
        metadata = {"geometry_name": "SHAPE", "crs": "EPSG:4326"}
        return metadata, pa.table({"NAME": names, "SHAPE": shapes})

    monkeypatch.setattr("pipeline.transform.merge.pyogrio.read_arrow", stub_read_arrow)

    geometries, crs = _load_conservation_area_geometries(gpkg_path)

    assert crs == "EPSG:4326"
    assert geometries == [real_area]


def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
    """Only the listed point a few metres from a postcode yields a candidate row."""
    # Upper-case the name and collapse every non-alphanumeric run to one space.
    match_name_expr = (
        pl.col("Name")
        .str.to_uppercase()
        .str.replace_all(r"[^0-9A-Z]+", " ")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
        .alias("_listed_match_name")
    )
    listed_points = pl.DataFrame(
        {
            "ListEntry": [1234, 5678],
            "Name": ["1 and 2 High Street", "Distant Hall"],
            "Grade": ["II", "I"],
            "Easting": [100.0, 1000.0],
            "Northing": [100.0, 1000.0],
        }
    ).with_columns(match_name_expr)
    active_postcodes = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "east1m": [105.0, 5000.0],
            "north1m": [105.0, 5000.0],
        }
    )

    candidates = _postcode_listed_building_candidates(
        listed_points,
        active_postcodes,
        nearest_postcodes=1,
        max_distance_m=25,
    )

    matched_rows = candidates.select("postcode", "_listed_match_name").to_dicts()
    assert matched_rows == [
        {"postcode": "AA1 1AA", "_listed_match_name": "1 AND 2 HIGH STREET"}
    ]


def test_matched_listed_building_flags_requires_address_match() -> None:
    """Only properties whose address matches a listed name are returned, flagged "Yes"."""
    properties = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AA", "BB1 1BB"],
            "pp_address": ["1 HIGH STREET", "99 HIGH STREET", "THE OLD RECTORY"],
            "epc_address": ["1, High Street", "99, High Street", "Old Rectory"],
        }
    )
    listed_candidates = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "_listed_match_name": ["1 AND 2 HIGH STREET", "OLD RECTORY"],
            "_listed_grade": ["II", "II*"],
            "_listed_entry": [1234, 5678],
        }
    )

    flags = _matched_listed_building_flags(
        properties.lazy(), listed_candidates, min_score=95
    )
    ordered_flags = flags.sort("postcode", "pp_address")

    expected_rows = [
        {
            "postcode": "AA1 1AA",
            "pp_address": "1 HIGH STREET",
            LISTED_BUILDING_FEATURE: "Yes",
        },
        {
            "postcode": "BB1 1BB",
            "pp_address": "THE OLD RECTORY",
            LISTED_BUILDING_FEATURE: "Yes",
        },
    ]
    assert ordered_flags.to_dicts() == expected_rows


def test_validate_property_postcodes_rejects_blank_rows() -> None:
    """A row with an empty-string postcode makes validation raise ValueError."""
    property_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", ""],
            "Address per Property Register": ["1 Example Street", "2 Example Street"],
            "Last known price": [100_000, 200_000],
        }
    )

    with pytest.raises(ValueError, match="Property rows missing a postcode"):
        _validate_property_postcodes(property_rows)


def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
) -> None:
    """No error when rental data lacks only Isles of Scilly and City of London."""
    iod_path, ethnicity_path, rental_path = (
        tmp_path / f"{stem}.parquet" for stem in ("iod", "ethnicity", "rental")
    )
    lad_codes = ["E08000016", "E06000053", "E09000001"]
    lad_names = ["Barnsley", "Isles of Scilly", "City of London"]

    iod_frame = pl.DataFrame(
        {
            "Local Authority District code (2024)": lad_codes,
            "Local Authority District name (2024)": lad_names,
        }
    )
    iod_frame.write_parquet(iod_path)

    ethnicity_frame = pl.DataFrame(
        {"Geography_code": ["E08000016", "E06000053", "E09000001"]}
    )
    ethnicity_frame.write_parquet(ethnicity_path)

    rental_frame = pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]})
    rental_frame.write_parquet(rental_path)

    # Passing means the call returns without raising.
    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)


def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    """A LAD in the IoD and ethnicity data but absent from rental data raises."""
    iod_path, ethnicity_path, rental_path = (
        tmp_path / f"{stem}.parquet" for stem in ("iod", "ethnicity", "rental")
    )

    iod_frame = pl.DataFrame(
        {
            "Local Authority District code (2024)": ["E08000016"],
            "Local Authority District name (2024)": ["Barnsley"],
        }
    )
    iod_frame.write_parquet(iod_path)

    ethnicity_frame = pl.DataFrame({"Geography_code": ["E08000016"]})
    ethnicity_frame.write_parquet(ethnicity_path)

    # The rental data covers a different LAD code, leaving E08000016 uncovered.
    rental_frame = pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]})
    rental_frame.write_parquet(rental_path)

    with pytest.raises(ValueError, match="Rental data is missing"):
        _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)


def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
    """The 50m percentile column comes back as TREE_DENSITY_FEATURE in Float32."""
    parquet_path = tmp_path / "tree_density_by_postcode.parquet"
    source_frame = pl.DataFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            "Tree canopy density percentile within 50m": [12.5, 99.0],
        }
    )
    source_frame.write_parquet(parquet_path)

    density = _tree_density_by_postcode(parquet_path).collect().sort("postcode")

    assert density.columns == ["postcode", TREE_DENSITY_FEATURE]
    assert density[TREE_DENSITY_FEATURE].to_list() == [12.5, 99.0]
    assert density.schema[TREE_DENSITY_FEATURE] == pl.Float32


def test_tree_density_by_postcode_requires_postcode_and_density_columns(
    tmp_path,
) -> None:
    """A file lacking the density column or the postcode column raises ValueError."""
    # Case 1: postcode present, but no tree-density column.
    missing_density_path = tmp_path / "tree_density_by_postcode.parquet"
    missing_density_frame = pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]})
    missing_density_frame.write_parquet(missing_density_path)

    with pytest.raises(ValueError, match="must contain column"):
        _tree_density_by_postcode(missing_density_path)

    # Case 2: tree-density column present, but no postcode column.
    missing_postcode_path = tmp_path / "missing_postcode.parquet"
    missing_postcode_frame = pl.DataFrame(
        {"Tree canopy density percentile within 50m": [12.5]}
    )
    missing_postcode_frame.write_parquet(missing_postcode_path)

    with pytest.raises(ValueError, match="missing required column: postcode"):
        _tree_density_by_postcode(missing_postcode_path)