LGTM
This commit is contained in:
parent
a8165249a4
commit
a4103b0896
64 changed files with 5376 additions and 3832 deletions
|
|
@@ -1,8 +1,14 @@
|
|||
import polars as pl
|
||||
import pytest
|
||||
|
||||
from pipeline.transform.merge import (
|
||||
_AREA_COLUMNS,
|
||||
TREE_DENSITY_FEATURE,
|
||||
_is_dynamic_poi_metric_column,
|
||||
_less_deprived_percentile_expr,
|
||||
_tree_density_by_postcode,
|
||||
_validate_lad_source_coverage,
|
||||
_validate_property_postcodes,
|
||||
)
|
||||
|
||||
|
||||
|
|
@@ -36,3 +42,103 @@ def test_dynamic_poi_metric_columns_are_area_level() -> None:
|
|||
assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 2km")
|
||||
assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 5km")
|
||||
assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
|
||||
|
||||
|
||||
def test_country_code_is_kept_in_postcode_area_columns() -> None:
    """Check that the ``ctry25cd`` column name is a member of ``_AREA_COLUMNS``."""
    country_code_column = "ctry25cd"
    assert country_code_column in _AREA_COLUMNS
|
||||
|
||||
|
||||
def test_validate_property_postcodes_rejects_blank_rows() -> None:
    """Expect ``ValueError`` when a property row has an empty-string postcode."""
    # Two rows: the first has a postcode, the second has "" in its place.
    property_columns = {
        "Postcode": ["AA1 1AA", ""],
        "Address per Property Register": ["1 Example Street", "2 Example Street"],
        "Last known price": [100_000, 200_000],
    }
    properties = pl.DataFrame(property_columns)

    with pytest.raises(ValueError, match="Property rows missing a postcode"):
        _validate_property_postcodes(properties)
|
||||
|
||||
|
||||
def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
) -> None:
    """Coverage validation must not raise for this set of source files.

    The IoD and ethnicity files list three LAD codes, while the rental file
    only has a row for ``E08000016``. The test passes when the validator
    returns without raising.
    """
    lad_codes = ["E08000016", "E06000053", "E09000001"]
    lad_names = ["Barnsley", "Isles of Scilly", "City of London"]

    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"

    iod_frame = pl.DataFrame(
        {
            "Local Authority District code (2024)": lad_codes,
            "Local Authority District name (2024)": lad_names,
        }
    )
    iod_frame.write_parquet(iod_path)

    ethnicity_frame = pl.DataFrame({"Geography_code": lad_codes})
    ethnicity_frame.write_parquet(ethnicity_path)

    # Rental data only covers the first of the three LADs.
    rental_frame = pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]})
    rental_frame.write_parquet(rental_path)

    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
||||
|
||||
|
||||
def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    """Expect ``ValueError`` when the rental file has no row for the IoD LAD.

    The IoD and ethnicity files both contain ``E08000016``, but the rental
    file only contains ``E08000019``.
    """
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"

    iod_frame = pl.DataFrame(
        {
            "Local Authority District code (2024)": ["E08000016"],
            "Local Authority District name (2024)": ["Barnsley"],
        }
    )
    iod_frame.write_parquet(iod_path)

    ethnicity_frame = pl.DataFrame({"Geography_code": ["E08000016"]})
    ethnicity_frame.write_parquet(ethnicity_path)

    # The rental row uses a different area code from the other two sources.
    rental_frame = pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]})
    rental_frame.write_parquet(rental_path)

    with pytest.raises(ValueError, match="Rental data is missing"):
        _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
||||
|
||||
|
||||
def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
    """The 50m percentile column must come back as ``TREE_DENSITY_FEATURE``.

    Asserts the collected result's column order, its values, and that the
    feature column has dtype ``pl.Float32``.
    """
    expected_percentiles = [12.5, 99.0]
    path = tmp_path / "tree_density_by_postcode.parquet"
    source_frame = pl.DataFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            "Tree canopy density percentile within 50m": expected_percentiles,
        }
    )
    source_frame.write_parquet(path)

    collected = _tree_density_by_postcode(path).collect()
    # Sort so the value comparison does not depend on row order.
    result = collected.sort("postcode")

    assert result.columns == ["postcode", TREE_DENSITY_FEATURE]
    assert result[TREE_DENSITY_FEATURE].to_list() == expected_percentiles
    assert result.schema[TREE_DENSITY_FEATURE] == pl.Float32
|
||||
|
||||
|
||||
def test_tree_density_by_postcode_requires_postcode_and_density_columns(
    tmp_path,
) -> None:
    """Expect ``ValueError`` when either required input column is absent.

    Case 1: a file with ``postcode`` but no density percentile column.
    Case 2: a file with the density percentile column but no ``postcode``.
    """
    # Case 1: the density column is missing.
    path = tmp_path / "tree_density_by_postcode.parquet"
    no_density_frame = pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]})
    no_density_frame.write_parquet(path)

    with pytest.raises(ValueError, match="must contain column"):
        _tree_density_by_postcode(path)

    # Case 2: the postcode column is missing.
    missing_postcode_path = tmp_path / "missing_postcode.parquet"
    no_postcode_frame = pl.DataFrame(
        {"Tree canopy density percentile within 50m": [12.5]}
    )
    no_postcode_frame.write_parquet(missing_postcode_path)

    with pytest.raises(ValueError, match="missing required column: postcode"):
        _tree_density_by_postcode(missing_postcode_path)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue