perfect-postcode/pipeline/transform/test_merge.py
2026-06-02 13:46:18 +01:00

1214 lines
46 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import polars as pl
import pyarrow as pa
import pytest
from shapely import box, to_wkb
from shapely.geometry import Point
from pipeline.transform.merge import (
_AREA_COLUMNS,
CONSERVATION_AREA_FEATURE,
LISTED_BUILDING_FEATURE,
TREE_DENSITY_FEATURE,
_LISTING_OVERLAY_SOURCES,
_active_english_postcode_area,
_build_unmatched_listing_seed_rows,
_canonical_postcode_expr,
_coalesce_direct_epc_columns,
_filter_to_active_english_postcodes,
_join_area_side_tables,
_finalize_listings,
_integrate_listings,
_match_direct_epc,
_match_listing_properties,
_normalize_uprn,
_is_dynamic_poi_metric_column,
_less_deprived_percentile_expr,
_load_conservation_area_geometries,
_load_listings_for_merge,
_matched_listed_building_flags,
_postcode_conservation_area_flags,
_postcode_listed_building_candidates,
_remap_terminated_postcodes,
_split_normal_outputs,
_tree_density_by_postcode,
_validate_lad_source_coverage,
_validate_postcode_feature_output,
_validate_property_postcodes,
)
def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """The lowest score maps to 100, the highest to 0, and null stays null."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})

    percentile_expr = _less_deprived_percentile_expr(column)
    percentiles = scores.lazy().with_columns(percentile_expr).collect()

    assert percentiles[column].to_list() == [100.0, 50.0, 0.0, None]
def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied minimum/maximum scores land exactly on 100.0 and 0.0."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]})

    percentile_expr = _less_deprived_percentile_expr(column)
    percentiles = scores.lazy().with_columns(percentile_expr).collect()

    assert percentiles[column].to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]
def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Per-amenity distance/count names are recognised; other counts are not."""
    dynamic_columns = (
        "Distance to nearest amenity (Cafe) (km)",
        "Distance to nearest amenity (Park) (km)",
        "Number of amenities (Cafe) within 2km",
        "Number of amenities (Cafe) within 5km",
    )
    for column in dynamic_columns:
        assert _is_dynamic_poi_metric_column(column)

    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
def test_country_code_is_kept_in_postcode_area_columns() -> None:
    """The ``ctry25cd`` country code is listed among the area columns."""
    country_code_column = "ctry25cd"
    assert country_code_column in _AREA_COLUMNS
def test_conservation_area_feature_is_area_level() -> None:
    """The conservation-area feature is listed among the area columns."""
    is_area_column = CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
    assert is_area_column
def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
    """Raw crime rollups are area columns; per-1k-residents variants are not."""
    # Crime is now a raw spatial count per postcode; the per-1k-residents
    # variants were dropped along with the LSOA population denominator.
    kept_columns = ("Serious crime (avg/yr)", "Minor crime (avg/yr)")
    dropped_columns = (
        "Serious crime per 1k residents (avg/yr)",
        "Minor crime per 1k residents (avg/yr)",
    )

    for column in kept_columns:
        assert column in _AREA_COLUMNS
    for column in dropped_columns:
        assert column not in _AREA_COLUMNS
def test_active_english_postcode_area_filters_to_active_england() -> None:
    """Only the row with no termination date and country E92000001 is returned."""
    source = pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
            "ctry25cd": ["E92000001", "E92000001", "W92000004"],
            "doterm": [None, "2020-01-01", None],
            "lat": [51.0, 51.1, 52.0],
            "long": [-0.1, -0.2, -3.0],
            "lsoa21cd": ["L1", "L2", "L3"],
            "oa21cd": ["O1", "O2", "O3"],
            "pcon24cd": ["P1", "P2", "P3"],
        }
    )
    # "AA1 1AB" has a termination date and "CF1 1AA" a different country
    # code, so only the first source row is expected, under renamed columns.
    expected_row = {
        "postcode": "AA1 1AA",
        "lat": 51.0,
        "lon": -0.1,
        "ctry25cd": "E92000001",
        "lsoa21": "L1",
        "oa21": "O1",
        "pcon": "P1",
    }

    filtered = _active_english_postcode_area(source.lazy()).collect()

    assert filtered.to_dicts() == [expected_row]
def test_remap_then_active_filter_keeps_terminated_english_properties() -> None:
    """A remapped old postcode survives the active filter under its new code."""
    properties = pl.DataFrame(
        {
            "postcode": ["OLD 1AA", "NEW 1AA", "CF1 1AA"],
            "row_id": [1, 2, 3],
        }
    ).lazy()
    remap_table = pl.DataFrame(
        {"old_postcode": ["OLD 1AA"], "new_postcode": ["NEW 1AA"]}
    ).lazy()
    active = pl.DataFrame({"postcode": ["NEW 1AA"]}).lazy()

    # Remap first, then filter: row 1 must be rewritten to "NEW 1AA" before
    # the active-postcode check, while row 3 is not in the active set.
    remapped = _remap_terminated_postcodes(properties, remap_table)
    kept = _filter_to_active_english_postcodes(remapped, active)
    rows = kept.collect().sort("row_id").to_dicts()

    assert rows == [
        {"postcode": "NEW 1AA", "row_id": 1},
        {"postcode": "NEW 1AA", "row_id": 2},
    ]
def test_split_normal_outputs_uses_postcode_feature_universe() -> None:
    """Postcode output covers every feature-table postcode, not just property ones."""
    merged = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Address per Property Register": ["1 Example Road"],
            "Last known price": [250_000],
            "lat": [51.0],
            "lon": [-0.1],
            "ctry25cd": ["E92000001"],
            "lsoa21": ["L1"],
        }
    )
    # "BB1 1BB" has no property row but must still appear in the postcode output.
    feature_table = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            "lsoa21": ["L1", "L2"],
            "Distance to nearest amenity (Park) (km)": [0.3, 0.8],
        }
    )
    expected_property_row = {
        "Postcode": "AA1 1AA",
        "Address per Property Register": "1 Example Road",
        "Last known price": 250_000,
    }

    postcode_df, properties_df = _split_normal_outputs(
        merged, feature_table, expected_postcode_count=2
    )

    assert postcode_df["Postcode"].to_list() == ["AA1 1AA", "BB1 1BB"]
    assert "Distance to nearest amenity (Park) (km)" in postcode_df.columns
    assert properties_df.to_dicts() == [expected_property_row]
def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() -> None:
    """A row with null coordinates and a non-E92000001 country raises ``ValueError``."""
    postcode_features = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "CF1 1AA"],
            "lat": [51.0, None],
            "lon": [-0.1, None],
            "ctry25cd": ["E92000001", "W92000004"],
        }
    )
    expected_message = "unsupported or ungeocoded"

    with pytest.raises(ValueError, match=expected_message):
        _validate_postcode_feature_output(postcode_features, expected_postcode_count=2)
def test_listed_building_feature_is_property_level() -> None:
    """The listed-building feature is not one of the area columns."""
    is_area_column = LISTED_BUILDING_FEATURE in _AREA_COLUMNS
    assert not is_area_column
def test_postcode_conservation_area_flags_marks_point_membership() -> None:
    """Postcodes inside the polygon get "Yes"; outside or null-latitude ones get "No"."""
    postcode_points = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
            "lat": [0.5, 2.0, None],
            "lon": [0.5, 2.0, 0.5],
        }
    )
    unit_square = box(0, 0, 1, 1)

    flags = _postcode_conservation_area_flags(
        postcode_points, [unit_square], "EPSG:4326", batch_size=2
    )
    rows = flags.sort("postcode").to_dicts()

    assert rows == [
        {"postcode": "AA1 1AA", CONSERVATION_AREA_FEATURE: "Yes"},
        {"postcode": "BB1 1BB", CONSERVATION_AREA_FEATURE: "No"},
        {"postcode": "CC1 1CC", CONSERVATION_AREA_FEATURE: "No"},
    ]
def test_load_conservation_area_geometries_uses_current_planning_data_records(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path,
) -> None:
    """Of four fake records, only the open-ended conservation-area box is loaded."""
    geojson_path = tmp_path / "conservation_areas.geojson"
    current_polygon = box(0, 0, 1, 1)
    ended_polygon = box(2, 2, 3, 3)
    listed_building_polygon = box(4, 4, 5, 5)
    point_record = Point(0.5, 0.5)

    def fake_read_arrow(path):
        # Replacement for pyogrio.read_arrow: returns (metadata, arrow table).
        assert path == geojson_path
        records = pa.table(
            {
                "dataset": [
                    "conservation-area",
                    "conservation-area",
                    "listed-building",
                    "conservation-area",
                ],
                "end-date": ["", "2025-01-01", "", ""],
                "name": ["Central Village", "Old Boundary", "Other", "Point Record"],
                "SHAPE": to_wkb(
                    [
                        current_polygon,
                        ended_polygon,
                        listed_building_polygon,
                        point_record,
                    ]
                ),
            }
        )
        metadata = {"geometry_name": "SHAPE", "crs": "EPSG:4326"}
        return metadata, records

    monkeypatch.setattr("pipeline.transform.merge.pyogrio.read_arrow", fake_read_arrow)

    geometries, crs = _load_conservation_area_geometries(geojson_path)

    assert crs == "EPSG:4326"
    assert geometries == [current_polygon]
def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
    """Only the listed building within 25m of a postcode yields a candidate."""
    # Uppercase the name and collapse non-alphanumeric runs to single spaces.
    match_name = (
        pl.col("Name")
        .str.to_uppercase()
        .str.replace_all(r"[^0-9A-Z]+", " ")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
        .alias("_listed_match_name")
    )
    listed_buildings = pl.DataFrame(
        {
            "ListEntry": [1234, 5678],
            "Name": ["1 and 2 High Street", "Distant Hall"],
            "Grade": ["II", "I"],
            "Easting": [100.0, 1000.0],
            "Northing": [100.0, 1000.0],
        }
    ).with_columns(match_name)
    postcode_grid = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "east1m": [105.0, 5000.0],
            "north1m": [105.0, 5000.0],
        }
    )

    candidates = _postcode_listed_building_candidates(
        listed_buildings,
        postcode_grid,
        nearest_postcodes=1,
        max_distance_m=25,
    )
    rows = candidates.select("postcode", "_listed_match_name").to_dicts()

    assert rows == [
        {"postcode": "AA1 1AA", "_listed_match_name": "1 AND 2 HIGH STREET"}
    ]
def test_matched_listed_building_flags_requires_address_match() -> None:
    """Sharing a postcode is not enough: "99 HIGH STREET" must not be flagged."""
    property_rows = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AA", "BB1 1BB"],
            "pp_address": ["1 HIGH STREET", "99 HIGH STREET", "THE OLD RECTORY"],
            "epc_address": ["1, High Street", "99, High Street", "Old Rectory"],
        }
    )
    candidates = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "_listed_match_name": ["1 AND 2 HIGH STREET", "OLD RECTORY"],
            "_listed_grade": ["II", "II*"],
            "_listed_entry": [1234, 5678],
        }
    )
    expected_rows = [
        {
            "postcode": "AA1 1AA",
            "pp_address": "1 HIGH STREET",
            LISTED_BUILDING_FEATURE: "Yes",
        },
        {
            "postcode": "BB1 1BB",
            "pp_address": "THE OLD RECTORY",
            LISTED_BUILDING_FEATURE: "Yes",
        },
    ]

    flags = _matched_listed_building_flags(
        property_rows.lazy(), candidates, min_score=95
    )
    rows = flags.sort("postcode", "pp_address").to_dicts()

    assert rows == expected_rows
def test_validate_property_postcodes_rejects_blank_rows() -> None:
    """A property row with an empty-string postcode raises ``ValueError``."""
    properties = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", ""],
            "Address per Property Register": ["1 Example Street", "2 Example Street"],
            "Last known price": [100_000, 200_000],
        }
    )
    expected_message = "Property rows missing a postcode"

    with pytest.raises(ValueError, match=expected_message):
        _validate_property_postcodes(properties)
def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
) -> None:
    """Validation does not raise when rental rows exist only for E08000016."""
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"

    iod = pl.DataFrame(
        {
            "Local Authority District code (2024)": [
                "E08000016",
                "E06000053",
                "E09000001",
            ],
            "Local Authority District name (2024)": [
                "Barnsley",
                "Isles of Scilly",
                "City of London",
            ],
        }
    )
    ethnicity = pl.DataFrame(
        {"Geography_code": ["E08000016", "E06000053", "E09000001"]}
    )
    # Rental data omits E06000053 and E09000001; the test expects that to pass.
    rental = pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]})

    iod.write_parquet(iod_path)
    ethnicity.write_parquet(ethnicity_path)
    rental.write_parquet(rental_path)

    # Passing means simply returning without an exception.
    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    """Rental data lacking the only IoD/ethnicity LAD (E08000016) raises."""
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"

    iod = pl.DataFrame(
        {
            "Local Authority District code (2024)": ["E08000016"],
            "Local Authority District name (2024)": ["Barnsley"],
        }
    )
    ethnicity = pl.DataFrame({"Geography_code": ["E08000016"]})
    # The rental table covers a different area code than the other sources.
    rental = pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]})

    iod.write_parquet(iod_path)
    ethnicity.write_parquet(ethnicity_path)
    rental.write_parquet(rental_path)

    with pytest.raises(ValueError, match="Rental data is missing"):
        _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
    """The 50m percentile column is exposed as ``TREE_DENSITY_FEATURE`` in Float32."""
    parquet_path = tmp_path / "tree_density_by_postcode.parquet"
    source = pl.DataFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            "Tree canopy density percentile within 50m": [12.5, 99.0],
        }
    )
    source.write_parquet(parquet_path)

    density = _tree_density_by_postcode(parquet_path).collect().sort("postcode")

    assert density.columns == ["postcode", TREE_DENSITY_FEATURE]
    assert density[TREE_DENSITY_FEATURE].to_list() == [12.5, 99.0]
    assert density.schema[TREE_DENSITY_FEATURE] == pl.Float32
def test_tree_density_by_postcode_requires_postcode_and_density_columns(
    tmp_path,
) -> None:
    """Missing either the density column or ``postcode`` raises ``ValueError``."""
    # Case 1: postcode is present but no density column is.
    no_density_path = tmp_path / "tree_density_by_postcode.parquet"
    no_density = pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]})
    no_density.write_parquet(no_density_path)
    with pytest.raises(ValueError, match="must contain column"):
        _tree_density_by_postcode(no_density_path)

    # Case 2: the density column is present but postcode is not.
    missing_postcode_path = tmp_path / "missing_postcode.parquet"
    no_postcode = pl.DataFrame({"Tree canopy density percentile within 50m": [12.5]})
    no_postcode.write_parquet(missing_postcode_path)
    with pytest.raises(ValueError, match="missing required column: postcode"):
        _tree_density_by_postcode(missing_postcode_path)
def _sample_listings_frame() -> pl.DataFrame:
    """Return a one-row listings frame with an explicit schema.

    The postcode is deliberately stored compact and lower-case ("sw1a1aa").
    """
    listing = {
        "Bedrooms": [3],
        "Bathrooms": [2],
        "Number of bedrooms & living rooms": [4],
        "lon": [-0.1],
        "lat": [51.5],
        "Postcode": ["sw1a1aa"],
        "Address per Property Register": ["1 Example Road"],
        "Leasehold/Freehold": ["Freehold"],
        "Property type": ["Terraced"],
        "Property sub-type": ["Mid-Terrace"],
        "Price qualifier": [""],
        "Total floor area (sqm)": [120.0],
        "Listing URL": ["https://example.test/abc"],
        "Listing features": [["Garden", "Off-street parking"]],
        "Listing date": [None],
        "Listing status": ["For sale"],
        "Asking price": [750_000],
        "Asking price per sqm": [6_250],
    }
    dtypes = {
        "Bedrooms": pl.Int32,
        "Bathrooms": pl.Int32,
        "Number of bedrooms & living rooms": pl.Int32,
        "lon": pl.Float64,
        "lat": pl.Float64,
        "Postcode": pl.Utf8,
        "Address per Property Register": pl.Utf8,
        "Leasehold/Freehold": pl.Utf8,
        "Property type": pl.Utf8,
        "Property sub-type": pl.Utf8,
        "Price qualifier": pl.Utf8,
        "Total floor area (sqm)": pl.Float64,
        "Listing URL": pl.Utf8,
        "Listing features": pl.List(pl.Utf8),
        "Listing date": pl.Datetime("us"),
        "Listing status": pl.Utf8,
        "Asking price": pl.Int64,
        "Asking price per sqm": pl.Int32,
    }
    return pl.DataFrame(listing, schema=dtypes)
def _stub_arcgis(path) -> None:
    """Write a one-row postcode parquet for "SW1A 1AA" to ``path``."""
    row = {
        "pcds": ["SW1A 1AA"],
        "ctry25cd": ["E92000001"],
        "doterm": [None],
        "east1m": [530000.0],
        "north1m": [180000.0],
    }
    dtypes = {
        "pcds": pl.Utf8,
        "ctry25cd": pl.Utf8,
        "doterm": pl.Utf8,
        "east1m": pl.Float64,
        "north1m": pl.Float64,
    }
    pl.DataFrame(row, schema=dtypes).write_parquet(path)
def test_canonical_postcode_expr_formats_compact_postcodes() -> None:
    """Compact and spaced inputs both become "SW1A 1AA"; "bad" and null give null."""
    raw = pl.DataFrame({"Postcode": ["sw1a1aa", "SW1A 1AA", "bad", None]})

    canonical_expr = _canonical_postcode_expr("Postcode").alias("canonical")
    canonical = raw.with_columns(canonical_expr)["canonical"].to_list()

    assert canonical == ["SW1A 1AA", "SW1A 1AA", None, None]
def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
    tmp_path,
) -> None:
    """Loaded listings carry a canonical postcode plus ``_actual_*`` overlay columns."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    loaded = _load_listings_for_merge(listings_path, arcgis_path)

    expected_columns = {
        "postcode": ["SW1A 1AA"],
        "pp_address": ["1 Example Road"],
        "_actual_listing_url": ["https://example.test/abc"],
        "_actual_asking_price": [750_000],
        "_actual_lat": [51.5],
    }
    for column, values in expected_columns.items():
        assert loaded[column].to_list() == values
def test_load_listings_for_merge_uprn_key_matches_normalize_uprn(tmp_path) -> None:
    """A Float64 UPRN column loads as the same key ``_normalize_uprn`` produces."""
    # A Float UPRN (e.g. read from a NaN-bearing parquet column) must produce
    # the same digits-only key as `_normalize_uprn` on the candidate side, so
    # the exact UPRN match is not lost. Naively stringifying "100023336956.0"
    # and stripping non-digits would yield "1000233369560" (a bogus trailing
    # zero) which never collides with the candidate key "100023336956".
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    float_uprn = pl.lit(100023336956.0, dtype=pl.Float64).alias("UPRN")
    _sample_listings_frame().with_columns(float_uprn).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    loaded = _load_listings_for_merge(listings_path, arcgis_path)
    uprn_keys = loaded["_listing_uprn"].to_list()

    assert uprn_keys == [_normalize_uprn(100023336956.0)]
    assert uprn_keys == ["100023336956"]
def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
    tmp_path,
) -> None:
    """A seed row copies the listing's fields into the property-shaped columns."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    listings = _load_listings_for_merge(listings_path, arcgis_path)

    overlay_dtypes = {dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES}
    template_schema = pl.Schema(
        {
            "postcode": pl.Utf8,
            "pp_address": pl.Utf8,
            "pp_property_type": pl.Utf8,
            "duration": pl.Utf8,
            "total_floor_area": pl.Float64,
            "number_habitable_rooms": pl.Int16,
            "latest_price": pl.Int64,
            "epc_address": pl.Utf8,
            **overlay_dtypes,
        }
    )

    # Treat every loaded listing as unmatched.
    unmatched_idxs = listings.select("_listing_idx")
    seed = _build_unmatched_listing_seed_rows(unmatched_idxs, listings, template_schema)

    assert seed.height == 1
    expected_columns = {
        "postcode": ["SW1A 1AA"],
        "pp_address": ["1 Example Road"],
        "pp_property_type": ["Terraced"],
        "duration": ["Freehold"],
        "total_floor_area": [120.0],
        "number_habitable_rooms": [4],
        "latest_price": [750_000],
        # Columns not populated from the listing default to null.
        "epc_address": [None],
        # Overlay columns flow through 1:1.
        "_actual_listing_url": ["https://example.test/abc"],
    }
    for column, values in expected_columns.items():
        assert seed[column].to_list() == values
def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
    tmp_path,
) -> None:
    """Null listing fields are filled from the ``_direct_*`` EPC columns."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # Blank out the listing's own floor area and room count.
    blanked_listing = _sample_listings_frame().with_columns(
        pl.lit(None, dtype=pl.Float64).alias("Total floor area (sqm)"),
        pl.lit(None, dtype=pl.Int32).alias("Number of bedrooms & living rooms"),
    )
    blanked_listing.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    direct_epc_columns = [
        pl.lit("1 Example Road").alias("_direct_epc_address"),
        pl.lit("C").alias("_direct_current_energy_rating"),
        pl.lit("B").alias("_direct_potential_energy_rating"),
        pl.lit(98.0).alias("_direct_total_floor_area"),
        pl.lit(4, dtype=pl.Int16).alias("_direct_number_habitable_rooms"),
        pl.lit(2.4).alias("_direct_floor_height"),
        pl.lit(1930, dtype=pl.UInt16).alias("_direct_construction_age_band"),
        pl.lit(1, dtype=pl.UInt8).alias("_direct_is_construction_date_approximate"),
        pl.lit("No").alias("_direct_was_council_house"),
    ]
    listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns(
        *direct_epc_columns
    )

    overlay_dtypes = {dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES}
    template_schema = pl.Schema(
        {
            "postcode": pl.Utf8,
            "pp_address": pl.Utf8,
            "total_floor_area": pl.Float64,
            "number_habitable_rooms": pl.Int16,
            "epc_address": pl.Utf8,
            "current_energy_rating": pl.Utf8,
            "was_council_house": pl.Utf8,
            **overlay_dtypes,
        }
    )

    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )

    expected_columns = {
        "total_floor_area": [98.0],
        "number_habitable_rooms": [4],
        "epc_address": ["1 Example Road"],
        "current_energy_rating": ["C"],
        "was_council_house": ["No"],
    }
    for column, values in expected_columns.items():
        assert seed[column].to_list() == values
# Column dtypes for the frames built by `_direct_epc_candidates`.
_DIRECT_EPC_CANDIDATE_SCHEMA = {
    # Row identifier and the keys compared against a listing.
    "_direct_epc_row": pl.UInt32,
    "_direct_epc_match_address": pl.Utf8,
    "_direct_epc_match_postcode": pl.Utf8,
    "_direct_epc_outcode": pl.Utf8,
    "_direct_epc_canonical_property_type": pl.Utf8,
    "_direct_epc_uprn": pl.Utf8,
    # EPC attribute columns.
    "_direct_epc_address": pl.Utf8,
    "_direct_current_energy_rating": pl.Utf8,
    "_direct_potential_energy_rating": pl.Utf8,
    "_direct_total_floor_area": pl.Float64,
    "_direct_number_habitable_rooms": pl.Int16,
    "_direct_floor_height": pl.Float64,
    "_direct_construction_age_band": pl.UInt16,
    "_direct_is_construction_date_approximate": pl.UInt8,
    "_direct_was_council_house": pl.Utf8,
}
# Column dtypes for the frames built by `_listing_matches`.
_LISTING_MATCH_SCHEMA = {
    "_listing_idx": pl.UInt32,
    "_listing_match_address": pl.Utf8,
    "_listing_match_postcode": pl.Utf8,
    "_listing_uprn": pl.Utf8,
}
def _direct_epc_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build an EPC candidate frame, one row per dict in ``rows``.

    Each dict overrides the default candidate values below; keys it omits
    keep the defaults.
    """
    defaults = {
        "_direct_epc_row": 0,
        "_direct_epc_match_address": "1 EXAMPLE ROAD",
        "_direct_epc_match_postcode": "AA11AA",
        "_direct_epc_outcode": "AA1",
        "_direct_epc_canonical_property_type": "Terraced",
        "_direct_epc_uprn": None,
        "_direct_epc_address": "1, Example Road",
        "_direct_current_energy_rating": "C",
        "_direct_potential_energy_rating": "B",
        "_direct_total_floor_area": 101.0,
        "_direct_number_habitable_rooms": 4,
        "_direct_floor_height": 2.5,
        "_direct_construction_age_band": 1930,
        "_direct_is_construction_date_approximate": 1,
        "_direct_was_council_house": "No",
    }
    records = [{**defaults, **overrides} for overrides in rows]
    return pl.DataFrame(records, schema=_DIRECT_EPC_CANDIDATE_SCHEMA)
def _listing_matches(rows: list[dict]) -> pl.DataFrame:
    """Build a listing match-key frame, one row per dict in ``rows``.

    Each dict overrides the default listing values below; keys it omits
    keep the defaults.
    """
    defaults = {
        "_listing_idx": 0,
        "_listing_match_address": "1 EXAMPLE ROAD",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
    }
    records = [{**defaults, **overrides} for overrides in rows]
    return pl.DataFrame(records, schema=_LISTING_MATCH_SCHEMA)
def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None:
    """A shared UPRN matches even when listing and EPC postcodes differ."""
    # UPRN is matched globally (not within a postcode bucket), so a listing
    # whose detail-page postcode is slightly off still resolves to the right
    # EPC certificate by its UPRN.
    shared_uprn = "100000000001"
    listing = _listing_matches(
        [{"_listing_uprn": shared_uprn, "_listing_match_postcode": "ZZ99ZZ"}]
    )
    epc = _direct_epc_candidates(
        [
            {
                "_direct_epc_uprn": shared_uprn,
                "_direct_epc_match_postcode": "AA11AA",
            }
        ]
    )

    matches = _match_direct_epc(listing, epc)

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["uprn"]
def test_match_direct_epc_matches_by_address_in_same_postcode() -> None:
    """With no UPRNs, an identical address in the same postcode matches."""
    listing = _listing_matches([{"_listing_match_address": "1 EXAMPLE ROAD"}])
    epc = _direct_epc_candidates([{"_direct_epc_match_address": "1 EXAMPLE ROAD"}])

    matches = _match_direct_epc(listing, epc)

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["address"]
def test_normalize_uprn_handles_types_and_floats() -> None:
    """Strings, ints and integral floats normalise to digits; the rest give None."""
    accepted = [
        (" 100012345678 ", "100012345678"),
        (100012345678, "100012345678"),
        # An integral float normalises to its digits, NOT "1230".
        (123.0, "123"),
    ]
    for raw, expected in accepted:
        assert _normalize_uprn(raw) == expected

    # Missing/blank values, and non-integral / NaN floats, are rejected
    # rather than mangled.
    for raw in (None, "", 1.5, float("nan")):
        assert _normalize_uprn(raw) is None
def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
    """``was_council_house`` is "Yes" if either side is; other columns coalesce."""
    # The raw property value is fill_null("No") upstream, so a plain coalesce
    # would let a non-null "No" override a directly-matched listing "Yes".
    # "Former council house" should fire if EITHER side says "Yes".
    all_null = [None] * 5
    columns = {
        "was_council_house": ["No", "Yes", "No", None, None],
        "_direct_was_council_house": ["Yes", "No", None, "Yes", None],
        # An unrelated direct-EPC column keeps the plain-coalesce behaviour.
        "current_energy_rating": [None, "C", "D", None, None],
        "_direct_current_energy_rating": ["B", "A", None, "E", None],
        # _coalesce_direct_epc_columns coalesces every pair in
        # _DIRECT_EPC_RAW_COLUMN_MAP, so the rest must be present too.
        "epc_address": all_null,
        "_direct_epc_address": all_null,
        "potential_energy_rating": all_null,
        "_direct_potential_energy_rating": all_null,
        "total_floor_area": all_null,
        "_direct_total_floor_area": all_null,
        "number_habitable_rooms": all_null,
        "_direct_number_habitable_rooms": all_null,
        "floor_height": all_null,
        "_direct_floor_height": all_null,
        "construction_age_band": all_null,
        "_direct_construction_age_band": all_null,
        "is_construction_date_approximate": all_null,
        "_direct_is_construction_date_approximate": all_null,
    }

    coalesced = _coalesce_direct_epc_columns(pl.LazyFrame(columns)).collect()

    council_house = coalesced["was_council_house"].to_list()
    assert council_house == ["Yes", "Yes", "No", "Yes", None]
    # Plain coalesce (raw wins when non-null) is untouched for other columns.
    energy_rating = coalesced["current_energy_rating"].to_list()
    assert energy_rating == ["B", "C", "D", "E", None]
def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    """A postcode absent from the crime table gets null crime rollups, not 0."""
    # The crime table is LEFT-joined per postcode; a postcode absent from it
    # must NOT be fabricated as "zero crime" (the safest value). When every
    # per-type column is null the Serious/Minor rollups must stay null.
    postcodes = ["AA1 1AA", "BB2 2BB"]
    lsoa_codes = ["E01000001", "E01000002"]
    lad_codes = ["E09000001", "E09000002"]
    pcon_codes = ["E14000001", "E14000002"]

    base = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoa_codes,
            "Local Authority District code (2024)": lad_codes,
            "pcon": pcon_codes,
        }
    )

    def _by_postcode(extra: dict) -> pl.LazyFrame:
        # Side table keyed on both postcodes, plus any extra columns.
        return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})

    # Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
    serious_components = {
        "Violence and sexual offences (avg/yr)": [1.0],
        "Robbery (avg/yr)": [2.0],
        "Burglary (avg/yr)": [3.0],
        "Possession of weapons (avg/yr)": [4.0],
    }
    minor_components = {
        "Anti-social behaviour (avg/yr)": [1.0],
        "Criminal damage and arson (avg/yr)": [1.0],
        "Shoplifting (avg/yr)": [1.0],
        "Bicycle theft (avg/yr)": [1.0],
        "Theft from the person (avg/yr)": [1.0],
        "Other theft (avg/yr)": [1.0],
        "Vehicle crime (avg/yr)": [1.0],
        "Public order (avg/yr)": [1.0],
        "Drugs (avg/yr)": [1.0],
        "Other crime (avg/yr)": [1.0],
    }
    crime = pl.LazyFrame(
        {"postcode": ["AA1 1AA"], **serious_components, **minor_components}
    )

    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": lsoa_codes}),
        ethnicity=pl.LazyFrame({"Geography_code": lad_codes}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": lsoa_codes}),
        election=pl.LazyFrame({"pcon": pcon_codes}),
        poi_counts=_by_postcode({}),
        noise=_by_postcode({}),
        school_proximity=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=pl.LazyFrame({"bb_postcode": postcodes}),
    ).collect()

    rollups = joined.select("postcode", "serious_crime_avg_yr", "minor_crime_avg_yr")
    by_postcode = {row["postcode"]: row for row in rollups.iter_rows(named=True)}
    present = by_postcode["AA1 1AA"]
    missing = by_postcode["BB2 2BB"]

    # Present postcode: rollups are the component sums (1+2+3+4, and ten
    # components of 1.0 each).
    assert present["serious_crime_avg_yr"] == 10.0
    assert present["minor_crime_avg_yr"] == 10.0
    # Missing postcode: rollups stay null rather than fabricating 0.0.
    assert missing["serious_crime_avg_yr"] is None
    assert missing["minor_crime_avg_yr"] is None
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build a property candidate frame, one row per dict in ``rows``.

    Each dict overrides the default property values below; keys it omits
    keep the defaults.
    """
    defaults = {
        "postcode": "AA1 1AA",
        "pp_address": "1 Example Road",
        "_property_match_postcode": "AA11AA",
        "_property_match_address": "1 EXAMPLE ROAD",
        "_property_epc_match_address": "1 EXAMPLE ROAD",
        "uprn": None,
    }
    dtypes = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "_property_match_postcode": pl.Utf8,
        "_property_match_address": pl.Utf8,
        "_property_epc_match_address": pl.Utf8,
        "uprn": pl.Utf8,
    }
    records = [{**defaults, **overrides} for overrides in rows]
    return pl.DataFrame(records, schema=dtypes)
def test_match_listing_properties_uprn_wins_dedup_tie() -> None:
    """When two listings claim one property, the UPRN-matched listing is kept."""
    # Two listings claim the same property: one by UPRN, one by exact address
    # (both score 100). The UPRN match must win even though it has the higher
    # _listing_idx (which would otherwise break the tie the wrong way).
    uprn_listing = {
        "_listing_idx": 5,
        "_listing_uprn": "100000000001",
        "_listing_match_address": "SOMETHING ELSE",
    }
    address_listing = {
        "_listing_idx": 1,
        "_listing_uprn": None,
        "_listing_match_address": "1 EXAMPLE ROAD",
    }
    listings = _listing_matches([uprn_listing, address_listing])
    properties = _property_candidates([{"uprn": "100000000001"}])

    matches = _match_listing_properties(listings, properties)

    assert matches.height == 1
    assert matches["_listing_idx"].to_list() == [5]
    assert matches["_property_match_method"].to_list() == ["uprn"]
def test_match_direct_epc_does_not_match_other_postcode_without_uprn() -> None:
    """The same address in another postcode, with no UPRN, yields no match."""
    # Matching is by postcode/UPRN/street — never by coordinate proximity — so a
    # same-street EPC in a different postcode with no shared UPRN is skipped.
    listing = _listing_matches([{"_listing_match_postcode": "AA11AA"}])
    epc = _direct_epc_candidates(
        [{"_direct_epc_match_postcode": "BB22BB", "_direct_epc_uprn": None}]
    )

    matches = _match_direct_epc(listing, epc)

    assert matches.height == 0
def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None:
    """The overlay lands on the address-matched property row and no other."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # Two properties in the listing's postcode; only the second shares its address.
    property_rows = {
        "postcode": ["SW1A 1AA", "SW1A 1AA"],
        "pp_address": ["9 Other Road", "1 Example Road"],
        "pp_property_type": ["Detached", "Terraced"],
        "duration": ["Freehold", "Freehold"],
        "total_floor_area": [80.0, 90.0],
        "number_habitable_rooms": [3, 4],
        "latest_price": [500_000, 600_000],
        "epc_address": [None, "1 Example Road"],
        "current_energy_rating": [None, "C"],
        "potential_energy_rating": [None, "B"],
        "floor_height": [None, 2.4],
        "construction_age_band": [None, 1930],
        "is_construction_date_approximate": [None, 1],
        "was_council_house": [None, "No"],
    }
    property_dtypes = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "pp_property_type": pl.Utf8,
        "duration": pl.Utf8,
        "total_floor_area": pl.Float64,
        "number_habitable_rooms": pl.Int16,
        "latest_price": pl.Int64,
        "epc_address": pl.Utf8,
        "current_energy_rating": pl.Utf8,
        "potential_energy_rating": pl.Utf8,
        "floor_height": pl.Float64,
        "construction_age_band": pl.UInt16,
        "is_construction_date_approximate": pl.UInt8,
        "was_council_house": pl.Utf8,
    }
    wide = pl.DataFrame(property_rows, schema=property_dtypes)

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    matched = integrated.filter(pl.col("pp_address") == "1 Example Road")
    other = integrated.filter(pl.col("pp_address") == "9 Other Road")
    assert matched["_actual_listing_url"].to_list() == ["https://example.test/abc"]
    assert other["_actual_listing_url"].to_list() == [None]
def test_integrate_listings_matches_by_uprn_over_address(tmp_path) -> None:
    """A shared UPRN attaches the overlay even though the addresses differ."""
    # The listing's address deliberately does not match the property's, but the
    # shared UPRN drives an exact match anyway (UPRN beats fuzzy street).
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    listing = _sample_listings_frame().with_columns(
        pl.lit("Totally Different Road").alias("Address per Property Register"),
        pl.lit("100000000009").alias("UPRN"),
    )
    listing.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    property_rows = {
        "postcode": ["SW1A 1AA"],
        "pp_address": ["1 Example Road"],
        "uprn": ["100000000009"],
        "pp_property_type": ["Terraced"],
        "duration": ["Freehold"],
        "total_floor_area": [90.0],
        "number_habitable_rooms": [4],
        "latest_price": [600_000],
        "epc_address": ["1 Example Road"],
        "current_energy_rating": ["C"],
        "potential_energy_rating": ["B"],
        "floor_height": [2.4],
        "construction_age_band": [1930],
        "is_construction_date_approximate": [1],
        "was_council_house": ["No"],
    }
    property_dtypes = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "uprn": pl.Utf8,
        "pp_property_type": pl.Utf8,
        "duration": pl.Utf8,
        "total_floor_area": pl.Float64,
        "number_habitable_rooms": pl.Int16,
        "latest_price": pl.Int64,
        "epc_address": pl.Utf8,
        "current_energy_rating": pl.Utf8,
        "potential_energy_rating": pl.Utf8,
        "floor_height": pl.Float64,
        "construction_age_band": pl.UInt16,
        "is_construction_date_approximate": pl.UInt8,
        "was_council_house": pl.Utf8,
    }
    wide = pl.DataFrame(property_rows, schema=property_dtypes)

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    matched = integrated.filter(pl.col("pp_address") == "1 Example Road")
    # The listing overlay attached to the UPRN-matched property row.
    assert matched["_actual_listing_url"].to_list() == ["https://example.test/abc"]
    # No spurious seed row for the listing's (non-matching) address.
    all_addresses = integrated["pp_address"].to_list()
    assert "Totally Different Road" not in all_addresses
def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> None:
    """A listing on a different street becomes its own row with the overlay."""
    # A number-less listing whose street is not the property's street (and which
    # shares no UPRN) must not be force-matched onto it; it becomes its own seed
    # row instead of stamping the wrong property's overlay.
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    listing = _sample_listings_frame().with_columns(
        pl.lit("Juniper Crescent").alias("Address per Property Register"),
    )
    listing.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    property_rows = {
        "postcode": ["SW1A 1AA"],
        "pp_address": ["Old Cottage High Street"],
        "pp_property_type": ["Terraced"],
        "duration": ["Freehold"],
        "total_floor_area": [120.0],
        "number_habitable_rooms": [4],
        "latest_price": [750_000],
        "epc_address": ["Old Cottage High Street"],
        "current_energy_rating": ["C"],
        "potential_energy_rating": ["B"],
        "floor_height": [2.4],
        "construction_age_band": [1930],
        "is_construction_date_approximate": [1],
        "was_council_house": ["No"],
    }
    property_dtypes = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "pp_property_type": pl.Utf8,
        "duration": pl.Utf8,
        "total_floor_area": pl.Float64,
        "number_habitable_rooms": pl.Int16,
        "latest_price": pl.Int64,
        "epc_address": pl.Utf8,
        "current_energy_rating": pl.Utf8,
        "potential_energy_rating": pl.Utf8,
        "floor_height": pl.Float64,
        "construction_age_band": pl.UInt16,
        "is_construction_date_approximate": pl.UInt8,
        "was_council_house": pl.Utf8,
    }
    wide = pl.DataFrame(property_rows, schema=property_dtypes)

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    existing = integrated.filter(pl.col("pp_address") == "Old Cottage High Street")
    seed = integrated.filter(pl.col("pp_address") == "Juniper Crescent")
    assert existing["_actual_listing_url"].to_list() == [None]
    assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
    None
):
    """Finalizing promotes `_actual_*` overlay values into the public columns.

    Two listing rows are fed in: one with historical context (EPC address and
    transaction date present) and one without. The finalized frame must expose
    the listing's values under the display column names, report a match status
    per row, and carry none of the `_actual_*` scaffolding columns.
    """
    input_schema = {
        "Postcode": pl.Utf8,
        "Address per Property Register": pl.Utf8,
        "Address per EPC": pl.Utf8,
        "Date of last transaction": pl.Float64,
        "lat": pl.Float64,
        "lon": pl.Float64,
        "Total floor area (sqm)": pl.Float64,
        "Number of bedrooms & living rooms": pl.Int16,
        "Property type": pl.Utf8,
        "Leasehold/Freehold": pl.Utf8,
        "Last known price": pl.Int64,
        "Street tree density percentile": pl.Float32,
        "_actual_listing_url": pl.Utf8,
        "_actual_asking_price": pl.Int64,
        "_actual_asking_price_per_sqm": pl.Int32,
        "_actual_listing_date": pl.Datetime("us"),
        "_actual_listing_status": pl.Utf8,
        "_actual_listing_features": pl.List(pl.Utf8),
        "_actual_bedrooms": pl.Int32,
        "_actual_bathrooms": pl.Int32,
        "_actual_price_qualifier": pl.Utf8,
        "_actual_property_sub_type": pl.Utf8,
        "_actual_lat": pl.Float64,
        "_actual_lon": pl.Float64,
        "_actual_total_floor_area": pl.Float64,
        "_actual_number_habitable_rooms": pl.Int16,
        "_actual_property_type": pl.Utf8,
        "_actual_leasehold_freehold": pl.Utf8,
    }
    input_rows = {
        # Historical columns: row 0 is fully populated; row 1 has no EPC
        # address, transaction date, room count, property type, tenure or price.
        "Postcode": ["SW1A 1AA", "SW1A 1AA"],
        "Address per Property Register": ["1 Example Road", "2 Example Road"],
        "Address per EPC": ["1 Example Road", None],
        "Date of last transaction": [1990.0, None],
        "lat": [51.5, 51.5],
        "lon": [-0.1, -0.1],
        "Total floor area (sqm)": [100.0, 95.0],
        "Number of bedrooms & living rooms": [3, None],
        "Property type": ["Terraced", None],
        "Leasehold/Freehold": ["Leasehold", None],
        "Last known price": [500_000, None],
        "Street tree density percentile": [42.0, 42.0],
        # Overlay columns: both rows carry a listing; row 0 is the matched
        # listing and row 1 the unmatched one.
        "_actual_listing_url": ["url0", "url1"],
        "_actual_asking_price": [600_000, 700_000],
        "_actual_asking_price_per_sqm": [5_000, None],
        "_actual_listing_date": [None, None],
        "_actual_listing_status": ["For sale", "For sale"],
        "_actual_listing_features": [["Garden"], ["Parking"]],
        "_actual_bedrooms": [3, 4],
        "_actual_bathrooms": [1, 2],
        "_actual_price_qualifier": ["", ""],
        "_actual_property_sub_type": ["Mid-Terrace", "End-Terrace"],
        "_actual_lat": [51.51, 51.52],
        "_actual_lon": [-0.11, -0.12],
        "_actual_total_floor_area": [110.0, None],
        "_actual_number_habitable_rooms": [4, 3],
        "_actual_property_type": ["Terraced", "Flats/Maisonettes"],
        "_actual_leasehold_freehold": ["Freehold", "Leasehold"],
    }
    frame = pl.DataFrame(input_rows, schema=input_schema)

    # Sort so the expectations below line up with input rows 0 and 1.
    result = _finalize_listings(frame).sort("Address per Property Register")

    assert result.height == 2

    expected_by_column = {
        "Listing URL": ["url0", "url1"],
        "Asking price": [600_000, 700_000],
        "Asking price per sqm": [5_000, 7_368],
        "Est. price per sqm": [5_000, 7_368],
        "Estimated current price": [600_000, 700_000],
        "Last known price": [500_000, 700_000],
        # Listing's preferred floor area / rooms / property type / tenure.
        "Total floor area (sqm)": [110.0, 95.0],
        "Number of bedrooms & living rooms": [4, 3],
        "Property type": ["Terraced", "Flats/Maisonettes"],
        "Leasehold/Freehold": ["Freehold", "Leasehold"],
        # Postcode-level feature carried through to both matched and unmatched rows.
        "Street tree density percentile": [42.0, 42.0],
        # Match status reflects historical context availability.
        "Historical property match status": ["matched", "unmatched"],
    }
    for column, expected_values in expected_by_column.items():
        assert result[column].to_list() == expected_values, column

    # Overlay scaffolding is dropped.
    output_columns = result.columns
    for source_name, overlay_name, _dtype in _LISTING_OVERLAY_SOURCES:
        assert overlay_name not in output_columns, source_name
def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
    """Finalizing emits one row per listing URL, not one per collapsed wide row.

    The terminated-postcode remap can collapse two distinct wide rows onto the
    same (postcode, pp_address), so a single matched listing attaches to both.
    The fixture reproduces that fan-out: two rows, one shared listing overlay.
    """
    schema = {
        "Postcode": pl.Utf8,
        "Address per Property Register": pl.Utf8,
        "Address per EPC": pl.Utf8,
        "Date of last transaction": pl.Float64,
        "lat": pl.Float64,
        "lon": pl.Float64,
        "Total floor area (sqm)": pl.Float64,
        "Number of bedrooms & living rooms": pl.Int16,
        "Property type": pl.Utf8,
        "Leasehold/Freehold": pl.Utf8,
        "Last known price": pl.Int64,
        "Street tree density percentile": pl.Float32,
        "_actual_listing_url": pl.Utf8,
        "_actual_asking_price": pl.Int64,
        "_actual_asking_price_per_sqm": pl.Int32,
        "_actual_listing_date": pl.Datetime("us"),
        "_actual_listing_status": pl.Utf8,
        "_actual_listing_features": pl.List(pl.Utf8),
        "_actual_bedrooms": pl.Int32,
        "_actual_bathrooms": pl.Int32,
        "_actual_price_qualifier": pl.Utf8,
        "_actual_property_sub_type": pl.Utf8,
        "_actual_lat": pl.Float64,
        "_actual_lon": pl.Float64,
        "_actual_total_floor_area": pl.Float64,
        "_actual_number_habitable_rooms": pl.Int16,
        "_actual_property_type": pl.Utf8,
        "_actual_leasehold_freehold": pl.Utf8,
    }

    # The two collapsed wide rows share postcode and address but differ in
    # transaction date, floor area and last known price.
    collapsed_rows = {
        "Postcode": ["SW1A 1AA", "SW1A 1AA"],
        "Address per Property Register": ["1 Example Road", "1 Example Road"],
        "Address per EPC": ["1 Example Road", "1 Example Road"],
        "Date of last transaction": [1990.0, 1995.0],
        "lat": [51.5, 51.5],
        "lon": [-0.1, -0.1],
        "Total floor area (sqm)": [100.0, 95.0],
        "Number of bedrooms & living rooms": [3, 3],
        "Property type": ["Terraced", "Terraced"],
        "Leasehold/Freehold": ["Leasehold", "Leasehold"],
        "Last known price": [500_000, 480_000],
        "Street tree density percentile": [42.0, 42.0],
    }
    # Same listing on both collapsed rows — the fan-out to fix. Each value is
    # written once here and duplicated onto both rows below.
    shared_listing = {
        "_actual_listing_url": "url0",
        "_actual_asking_price": 600_000,
        "_actual_asking_price_per_sqm": 5_000,
        "_actual_listing_date": None,
        "_actual_listing_status": "For sale",
        "_actual_listing_features": ["Garden"],
        "_actual_bedrooms": 3,
        "_actual_bathrooms": 1,
        "_actual_price_qualifier": "",
        "_actual_property_sub_type": "Mid-Terrace",
        "_actual_lat": 51.51,
        "_actual_lon": -0.11,
        "_actual_total_floor_area": 110.0,
        "_actual_number_habitable_rooms": 4,
        "_actual_property_type": "Terraced",
        "_actual_leasehold_freehold": "Freehold",
    }
    fanned_out_overlay = {
        column: [value, value] for column, value in shared_listing.items()
    }
    df = pl.DataFrame({**collapsed_rows, **fanned_out_overlay}, schema=schema)

    finalized = _finalize_listings(df)

    assert finalized.height == 1
    assert finalized["Listing URL"].to_list() == ["url0"]