"""Unit tests for private helpers exposed by ``pipeline.transform.merge``."""

import polars as pl
import pytest

from pipeline.transform.merge import (
    _AREA_COLUMNS,
    TREE_DENSITY_FEATURE,
    _is_dynamic_poi_metric_column,
    _less_deprived_percentile_expr,
    _tree_density_by_postcode,
    _validate_lad_source_coverage,
    _validate_property_postcodes,
)

# Column names shared by several tests; kept in one place so the fixtures and
# the assertions cannot drift apart.
_INCOME_SCORE_COLUMN = "Income Score (rate)"
_RAW_TREE_DENSITY_COLUMN = "Tree canopy density percentile within 50m"


def _less_deprived_percentiles(scores: list) -> list:
    """Run ``_less_deprived_percentile_expr`` over ``scores`` and return the output.

    The scores are placed in a single-column frame, the expression is applied
    through the lazy API, and the resulting column is returned as a plain list.
    """
    frame = pl.DataFrame({_INCOME_SCORE_COLUMN: scores})
    percentile_expr = _less_deprived_percentile_expr(_INCOME_SCORE_COLUMN)
    ranked = frame.lazy().with_columns(percentile_expr).collect()
    return ranked[_INCOME_SCORE_COLUMN].to_list()


def _write_lad_sources(tmp_path, lad_names_by_code: dict, rental_codes: list) -> tuple:
    """Write minimal IoD, ethnicity and rental parquet fixtures under ``tmp_path``.

    ``lad_names_by_code`` maps each LAD code to its name; the same codes are
    written to both the IoD and the ethnicity files. ``rental_codes`` lists the
    area codes present in the rental file, one single-bedroom row per code.

    Returns the ``(iod, ethnicity, rental)`` paths, in the positional order the
    tests pass them to ``_validate_lad_source_coverage``.
    """
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"

    lad_codes = list(lad_names_by_code)
    lad_names = list(lad_names_by_code.values())

    pl.DataFrame(
        {
            "Local Authority District code (2024)": lad_codes,
            "Local Authority District name (2024)": lad_names,
        }
    ).write_parquet(iod_path)
    pl.DataFrame({"Geography_code": lad_codes}).write_parquet(ethnicity_path)
    pl.DataFrame(
        {"area_code": rental_codes, "bedrooms": [1] * len(rental_codes)}
    ).write_parquet(rental_path)

    return iod_path, ethnicity_path, rental_path


def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """The lowest score maps to 100, the highest to 0, and nulls stay null."""
    observed = _less_deprived_percentiles([1.0, 2.0, 3.0, None])

    assert observed == [100.0, 50.0, 0.0, None]


def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied minimum and maximum scores all land exactly on 100 and 0."""
    observed = _less_deprived_percentiles([1.0, 1.0, 2.0, 3.0, 3.0])

    assert observed == [100.0, 100.0, 50.0, 0.0, 0.0]


def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Amenity distance/count columns are recognised; a restaurant count is not."""
    recognised_columns = (
        "Distance to nearest amenity (Cafe) (km)",
        "Distance to nearest amenity (Park) (km)",
        "Number of amenities (Cafe) within 2km",
        "Number of amenities (Cafe) within 5km",
    )

    for column_name in recognised_columns:
        assert _is_dynamic_poi_metric_column(column_name)

    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")


def test_country_code_is_kept_in_postcode_area_columns() -> None:
    """``ctry25cd`` must be listed in ``_AREA_COLUMNS``."""
    assert "ctry25cd" in _AREA_COLUMNS


def test_validate_property_postcodes_rejects_blank_rows() -> None:
    """A property row with an empty-string postcode raises ``ValueError``."""
    properties = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", ""],
            "Address per Property Register": ["1 Example Street", "2 Example Street"],
            "Last known price": [100_000, 200_000],
        }
    )

    with pytest.raises(ValueError, match="Property rows missing a postcode"):
        _validate_property_postcodes(properties)


def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
) -> None:
    """Validation passes when rent is absent only for Isles of Scilly and City of London.

    IoD and ethnicity cover three LADs while the rental file covers Barnsley
    alone; the call is expected to return without raising.
    """
    source_paths = _write_lad_sources(
        tmp_path,
        {
            "E08000016": "Barnsley",
            "E06000053": "Isles of Scilly",
            "E09000001": "City of London",
        },
        ["E08000016"],
    )

    _validate_lad_source_coverage(*source_paths)


def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    """Validation fails when the rental file lacks the only LAD in IoD/ethnicity."""
    source_paths = _write_lad_sources(
        tmp_path,
        {"E08000016": "Barnsley"},
        ["E08000019"],
    )

    with pytest.raises(ValueError, match="Rental data is missing"):
        _validate_lad_source_coverage(*source_paths)


def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
    """The raw 50m percentile column is exposed as ``TREE_DENSITY_FEATURE`` in Float32."""
    source = tmp_path / "tree_density_by_postcode.parquet"
    pl.DataFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            _RAW_TREE_DENSITY_COLUMN: [12.5, 99.0],
        }
    ).write_parquet(source)

    # Sort so the value assertion does not depend on the reader's row order.
    density = _tree_density_by_postcode(source).collect().sort("postcode")

    assert density.columns == ["postcode", TREE_DENSITY_FEATURE]
    assert density[TREE_DENSITY_FEATURE].to_list() == [12.5, 99.0]
    assert density.schema[TREE_DENSITY_FEATURE] == pl.Float32


def test_tree_density_by_postcode_requires_postcode_and_density_columns(
    tmp_path,
) -> None:
    """Inputs lacking the density column or the postcode column raise ``ValueError``."""
    no_density_path = tmp_path / "tree_density_by_postcode.parquet"
    pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]}).write_parquet(
        no_density_path
    )

    with pytest.raises(ValueError, match="must contain column"):
        _tree_density_by_postcode(no_density_path)

    no_postcode_path = tmp_path / "missing_postcode.parquet"
    pl.DataFrame({_RAW_TREE_DENSITY_COLUMN: [12.5]}).write_parquet(no_postcode_path)

    with pytest.raises(ValueError, match="missing required column: postcode"):
        _tree_density_by_postcode(no_postcode_path)