250 lines
8.1 KiB
Python
250 lines
8.1 KiB
Python
import polars as pl
|
|
import pytest
|
|
from shapely import box
|
|
|
|
from pipeline.transform.merge import (
|
|
_AREA_COLUMNS,
|
|
CONSERVATION_AREA_FEATURE,
|
|
LISTED_BUILDING_FEATURE,
|
|
TREE_DENSITY_FEATURE,
|
|
_is_dynamic_poi_metric_column,
|
|
_less_deprived_percentile_expr,
|
|
_matched_listed_building_flags,
|
|
_postcode_conservation_area_flags,
|
|
_postcode_listed_building_candidates,
|
|
_tree_density_by_postcode,
|
|
_validate_lad_source_coverage,
|
|
_validate_property_postcodes,
|
|
)
|
|
|
|
|
|
def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """Lower raw scores must map to higher percentiles, and nulls must stay null."""
    column = "Income Score (rate)"
    frame = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})

    ranked = frame.lazy().with_columns(_less_deprived_percentile_expr(column)).collect()

    # The smallest score gets 100, the largest gets 0, and the null row is untouched.
    assert ranked[column].to_list() == [100.0, 50.0, 0.0, None]
|
|
|
|
|
|
def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied minimum and tied maximum scores must land exactly on 100 and 0."""
    column = "Income Score (rate)"
    scores = [1.0, 1.0, 2.0, 3.0, 3.0]

    lazy_frame = pl.DataFrame({column: scores}).lazy()
    percentile_expr = _less_deprived_percentile_expr(column)
    percentiles = lazy_frame.with_columns(percentile_expr).collect()[column].to_list()

    assert percentiles == [100.0, 100.0, 50.0, 0.0, 0.0]
|
|
|
|
|
|
def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Per-amenity distance/count column names are recognised as dynamic POI metrics."""
    dynamic_columns = (
        "Distance to nearest amenity (Cafe) (km)",
        "Distance to nearest amenity (Park) (km)",
        "Number of amenities (Cafe) within 2km",
        "Number of amenities (Cafe) within 5km",
    )
    for column in dynamic_columns:
        assert _is_dynamic_poi_metric_column(column)

    # A count column that does not follow the "amenities (<name>)" shape is rejected.
    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
|
|
|
|
|
|
def test_country_code_is_kept_in_postcode_area_columns() -> None:
    """The ``ctry25cd`` column must be listed in ``_AREA_COLUMNS``."""
    country_code_column = "ctry25cd"
    assert country_code_column in _AREA_COLUMNS
|
|
|
|
|
|
def test_conservation_area_feature_is_area_level() -> None:
    """The conservation-area feature must be one of the area-level columns."""
    is_area_level = CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
    assert is_area_level
|
|
|
|
|
|
def test_listed_building_feature_is_property_level() -> None:
    """The listed-building feature must not appear among the area-level columns."""
    is_area_level = LISTED_BUILDING_FEATURE in _AREA_COLUMNS
    assert not is_area_level
|
|
|
|
|
|
def test_postcode_conservation_area_flags_marks_point_membership() -> None:
    """Only the postcode whose point lies inside the polygon is flagged "Yes".

    The fixture has one point inside the unit square, one outside it, and one
    row with a null latitude; the last two are both expected to be "No".
    """
    unit_square = box(0, 0, 1, 1)
    postcodes = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
            "lat": [0.5, 2.0, None],
            "lon": [0.5, 2.0, 0.5],
        }
    )

    # NOTE(review): batch_size=2 with three rows presumably exercises more than
    # one batch -- confirm against the implementation.
    flags = _postcode_conservation_area_flags(
        postcodes,
        [unit_square],
        "EPSG:4326",
        batch_size=2,
    )
    by_postcode = flags.sort("postcode")

    expected_flags = {"AA1 1AA": "Yes", "BB1 1BB": "No", "CC1 1CC": "No"}
    assert by_postcode.to_dicts() == [
        {"postcode": postcode, CONSERVATION_AREA_FEATURE: flag}
        for postcode, flag in expected_flags.items()
    ]
|
|
|
|
|
|
def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
    """A listed point is paired with a nearby postcode; a distant one yields no row.

    The first listed point sits about 7m from "AA1 1AA", inside the 25m limit.
    The second is thousands of metres from every postcode in the fixture.
    """
    # Upper-case the name, collapse non-alphanumerics and whitespace runs to a
    # single space, then trim: this builds the "_listed_match_name" column.
    normalised_name = (
        pl.col("Name")
        .str.to_uppercase()
        .str.replace_all(r"[^0-9A-Z]+", " ")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
        .alias("_listed_match_name")
    )
    raw_listed_points = pl.DataFrame(
        {
            "ListEntry": [1234, 5678],
            "Name": ["1 and 2 High Street", "Distant Hall"],
            "Grade": ["II", "I"],
            "Easting": [100.0, 1000.0],
            "Northing": [100.0, 1000.0],
        }
    )
    listed_points = raw_listed_points.with_columns(normalised_name)
    active_postcodes = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "east1m": [105.0, 5000.0],
            "north1m": [105.0, 5000.0],
        }
    )

    candidates = _postcode_listed_building_candidates(
        listed_points,
        active_postcodes,
        nearest_postcodes=1,
        max_distance_m=25,
    )
    matched = candidates.select("postcode", "_listed_match_name").to_dicts()

    assert matched == [
        {"postcode": "AA1 1AA", "_listed_match_name": "1 AND 2 HIGH STREET"}
    ]
|
|
|
|
|
|
def test_matched_listed_building_flags_requires_address_match() -> None:
    """Sharing a postcode with a listed candidate is not enough to be flagged.

    "99 HIGH STREET" shares postcode "AA1 1AA" with the listed entry but is
    absent from the expected output; only the two matching addresses appear.
    """
    property_frame = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AA", "BB1 1BB"],
            "pp_address": ["1 HIGH STREET", "99 HIGH STREET", "THE OLD RECTORY"],
            "epc_address": ["1, High Street", "99, High Street", "Old Rectory"],
        }
    )
    candidate_frame = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "_listed_match_name": ["1 AND 2 HIGH STREET", "OLD RECTORY"],
            "_listed_grade": ["II", "II*"],
            "_listed_entry": [1234, 5678],
        }
    )

    flags = _matched_listed_building_flags(
        property_frame.lazy(),
        candidate_frame,
        min_score=95,
    )
    ordered = flags.sort("postcode", "pp_address")

    expected_rows = [
        {"postcode": postcode, "pp_address": address, LISTED_BUILDING_FEATURE: "Yes"}
        for postcode, address in (
            ("AA1 1AA", "1 HIGH STREET"),
            ("BB1 1BB", "THE OLD RECTORY"),
        )
    ]
    assert ordered.to_dicts() == expected_rows
|
|
|
|
|
|
def test_validate_property_postcodes_rejects_blank_rows() -> None:
    """A row with an empty-string postcode must make validation raise ValueError."""
    properties = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", ""],
            "Address per Property Register": ["1 Example Street", "2 Example Street"],
            "Last known price": [100_000, 200_000],
        }
    )
    expected_message = "Property rows missing a postcode"

    with pytest.raises(ValueError, match=expected_message):
        _validate_property_postcodes(properties)
|
|
|
|
|
|
def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
) -> None:
    """Rental data covering only Barnsley is accepted for this set of LADs.

    The IoD and ethnicity sources also list Isles of Scilly and City of London,
    which have no rental rows; validation is expected to return without raising.
    """
    lad_codes = ["E08000016", "E06000053", "E09000001"]
    lad_names = ["Barnsley", "Isles of Scilly", "City of London"]

    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"

    iod_frame = pl.DataFrame(
        {
            "Local Authority District code (2024)": lad_codes,
            "Local Authority District name (2024)": lad_names,
        }
    )
    iod_frame.write_parquet(iod_path)

    ethnicity_frame = pl.DataFrame({"Geography_code": lad_codes})
    ethnicity_frame.write_parquet(ethnicity_path)

    rental_frame = pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]})
    rental_frame.write_parquet(rental_path)

    # Passing means no exception is raised.
    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
|
|
|
|
|
def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    """A LAD present in IoD/ethnicity but missing from rental data must raise.

    Barnsley (E08000016) is in both reference sources, while the rental file
    only has rows for a different code (E08000019).
    """
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"

    iod_frame = pl.DataFrame(
        {
            "Local Authority District code (2024)": ["E08000016"],
            "Local Authority District name (2024)": ["Barnsley"],
        }
    )
    iod_frame.write_parquet(iod_path)

    ethnicity_frame = pl.DataFrame({"Geography_code": ["E08000016"]})
    ethnicity_frame.write_parquet(ethnicity_path)

    rental_frame = pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]})
    rental_frame.write_parquet(rental_path)

    expected_message = "Rental data is missing"
    with pytest.raises(ValueError, match=expected_message):
        _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
|
|
|
|
|
def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
    """The 50m percentile column is exposed as ``TREE_DENSITY_FEATURE`` in Float32."""
    source_path = tmp_path / "tree_density_by_postcode.parquet"
    source_frame = pl.DataFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            "Tree canopy density percentile within 50m": [12.5, 99.0],
        }
    )
    source_frame.write_parquet(source_path)

    loaded = _tree_density_by_postcode(source_path).collect().sort("postcode")

    assert loaded.columns == ["postcode", TREE_DENSITY_FEATURE]
    assert loaded[TREE_DENSITY_FEATURE].to_list() == [12.5, 99.0]
    assert loaded.schema[TREE_DENSITY_FEATURE] == pl.Float32
|
|
|
|
|
|
def test_tree_density_by_postcode_requires_postcode_and_density_columns(
    tmp_path,
) -> None:
    """A file lacking either the density column or ``postcode`` raises ValueError."""
    # Case 1: postcode is present but there is no density column.
    no_density_path = tmp_path / "tree_density_by_postcode.parquet"
    no_density_frame = pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]})
    no_density_frame.write_parquet(no_density_path)

    with pytest.raises(ValueError, match="must contain column"):
        _tree_density_by_postcode(no_density_path)

    # Case 2: the density column is present but postcode is missing.
    missing_postcode_path = tmp_path / "missing_postcode.parquet"
    no_postcode_frame = pl.DataFrame(
        {"Tree canopy density percentile within 50m": [12.5]}
    )
    no_postcode_frame.write_parquet(missing_postcode_path)

    with pytest.raises(ValueError, match="missing required column: postcode"):
        _tree_density_by_postcode(missing_postcode_path)
|