perfect-postcode/pipeline/transform/test_merge.py
Andras Schmelczer f59d01227b
Some checks failed
Build and publish Docker image / build-and-push (push) Failing after 15s
CI / Check (push) Failing after 1m58s
Split up
2026-06-12 21:51:37 +01:00

1810 lines
69 KiB
Python

import polars as pl
import pyarrow as pa
import pytest
from shapely import box, to_wkb
from shapely.geometry import Point
from pipeline.transform.merge import (
_AREA_COLUMNS,
CONSERVATION_AREA_FEATURE,
LISTED_BUILDING_FEATURE,
TREE_DENSITY_FEATURE,
_LISTING_OVERLAY_SOURCES,
_active_english_postcode_area,
_build_unmatched_listing_seed_rows,
_canonical_postcode_expr,
_best_listing_match,
_coalesce_direct_epc_columns,
_dedupe_collapsed_properties,
_filter_to_active_english_postcodes,
_join_area_side_tables,
_finalize_listings,
_integrate_listings,
_match_direct_epc,
_match_listing_properties,
_normalize_uprn,
_is_dynamic_poi_metric_column,
_less_deprived_percentile_expr,
_load_conservation_area_geometries,
_load_listings_for_merge,
_matched_listed_building_flags,
_postcode_conservation_area_flags,
_postcode_listed_building_candidates,
_remap_terminated_postcodes,
_split_normal_outputs,
_tree_density_by_postcode,
_validate_lad_source_coverage,
_validate_lsoa_source_coverage,
_validate_postcode_feature_output,
_validate_property_postcodes,
)
def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """The lowest score ranks 100, the highest ranks 0, and a null stays null."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})
    percentile_expr = _less_deprived_percentile_expr(column)

    ranked = scores.lazy().with_columns(percentile_expr).collect()

    assert ranked[column].to_list() == [100.0, 50.0, 0.0, None]
def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied values at either extreme all land exactly on 100 or 0."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]})
    percentile_expr = _less_deprived_percentile_expr(column)

    ranked = scores.lazy().with_columns(percentile_expr).collect()

    assert ranked[column].to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]
def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Per-amenity distance/count names are recognised; a fixed-name count is not."""
    dynamic_names = (
        "Distance to nearest amenity (Cafe) (km)",
        "Distance to nearest amenity (Park) (km)",
        "Number of amenities (Cafe) within 2km",
        "Number of amenities (Cafe) within 5km",
    )
    for column_name in dynamic_names:
        assert _is_dynamic_poi_metric_column(column_name)

    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
def test_country_code_is_kept_in_postcode_area_columns() -> None:
    """The `ctry25cd` column must be listed in `_AREA_COLUMNS`."""
    country_code_column = "ctry25cd"
    assert country_code_column in _AREA_COLUMNS
def test_conservation_area_feature_is_area_level() -> None:
    """The conservation-area feature must be listed in `_AREA_COLUMNS`."""
    area_columns = _AREA_COLUMNS
    assert CONSERVATION_AREA_FEATURE in area_columns
def test_tree_density_is_area_level_and_survives_the_split() -> None:
    """Tree density reaches the postcode output and stays out of properties.

    Street tree density is a postcode-centroid percentile (constant per
    postcode), so it must route to the postcode/area output -- not be stripped
    by _area_columns_from -- and must NOT be duplicated into the property
    output. Regression for the drift where it landed only in properties.parquet
    and was lost for the ~308k property-less postcodes.
    """
    assert TREE_DENSITY_FEATURE in _AREA_COLUMNS

    # One property row, carrying its own copy of the density value.
    merged_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Last known price": [250_000],
            TREE_DENSITY_FEATURE: [42.0],
        }
    )
    # Two postcodes in the feature universe; BB1 1BB has no property row.
    feature_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            TREE_DENSITY_FEATURE: [42.0, 7.0],
        }
    )

    split = _split_normal_outputs(merged_rows, feature_rows, expected_postcode_count=2)
    postcode_df, properties_df = split

    assert TREE_DENSITY_FEATURE in postcode_df.columns
    assert postcode_df[TREE_DENSITY_FEATURE].to_list() == [42.0, 7.0]
    assert TREE_DENSITY_FEATURE not in properties_df.columns
def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
    """Raw crime counts are area columns; the per-1k-residents variants are not.

    Crime is now a raw spatial count per postcode; the per-1k-residents
    variants were dropped along with the LSOA population denominator.
    """
    kept_columns = ("Serious crime (avg/yr)", "Minor crime (avg/yr)")
    dropped_columns = (
        "Serious crime per 1k residents (avg/yr)",
        "Minor crime per 1k residents (avg/yr)",
    )

    for column_name in kept_columns:
        assert column_name in _AREA_COLUMNS
    for column_name in dropped_columns:
        assert column_name not in _AREA_COLUMNS
def test_active_english_postcode_area_filters_to_active_england() -> None:
    """Only the non-terminated English row survives, under the renamed columns."""
    # Three rows: active English, terminated English, active Welsh.
    raw_columns = {
        "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
        "ctry25cd": ["E92000001", "E92000001", "W92000004"],
        "doterm": [None, "2020-01-01", None],
        "lat": [51.0, 51.1, 52.0],
        "long": [-0.1, -0.2, -3.0],
        "lsoa21cd": ["L1", "L2", "L3"],
        "oa21cd": ["O1", "O2", "O3"],
        "pcon24cd": ["P1", "P2", "P3"],
    }
    expected_row = {
        "postcode": "AA1 1AA",
        "lat": 51.0,
        "lon": -0.1,
        "ctry25cd": "E92000001",
        "lsoa21": "L1",
        "oa21": "O1",
        "pcon": "P1",
    }

    arcgis = pl.DataFrame(raw_columns)
    filtered = _active_english_postcode_area(arcgis.lazy()).collect()

    assert filtered.to_dicts() == [expected_row]
def test_remap_then_active_filter_keeps_terminated_english_properties() -> None:
    """A row on a remapped old postcode survives the active filter; others drop."""
    property_rows = pl.DataFrame(
        {
            "postcode": ["OLD 1AA", "NEW 1AA", "CF1 1AA"],
            "row_id": [1, 2, 3],
        }
    ).lazy()
    successor_map = pl.DataFrame(
        {"old_postcode": ["OLD 1AA"], "new_postcode": ["NEW 1AA"]}
    ).lazy()
    active = pl.DataFrame({"postcode": ["NEW 1AA"]}).lazy()

    remapped = _remap_terminated_postcodes(property_rows, successor_map)
    kept = _filter_to_active_english_postcodes(remapped, active)
    kept_rows = kept.collect().sort("row_id").to_dicts()

    assert kept_rows == [
        {"postcode": "NEW 1AA", "row_id": 1},
        {"postcode": "NEW 1AA", "row_id": 2},
    ]
def test_split_normal_outputs_uses_postcode_feature_universe() -> None:
    """Postcode rows follow the feature table; properties keep property columns."""
    poi_column = "Distance to nearest amenity (Park) (km)"
    # The fields expected to remain on the property output after the split.
    property_fields = {
        "Postcode": "AA1 1AA",
        "Address per Property Register": "1 Example Road",
        "Last known price": 250_000,
    }
    merged_rows = pl.DataFrame(
        {
            **{name: [value] for name, value in property_fields.items()},
            "lat": [51.0],
            "lon": [-0.1],
            "ctry25cd": ["E92000001"],
            "lsoa21": ["L1"],
        }
    )
    # BB1 1BB exists only in the feature universe (no property row).
    feature_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            "lsoa21": ["L1", "L2"],
            poi_column: [0.3, 0.8],
        }
    )

    split = _split_normal_outputs(merged_rows, feature_rows, expected_postcode_count=2)
    postcode_df, properties_df = split

    assert postcode_df["Postcode"].to_list() == ["AA1 1AA", "BB1 1BB"]
    assert poi_column in postcode_df.columns
    assert properties_df.to_dicts() == [property_fields]
def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() -> None:
    """A Welsh row without coordinates makes the validator raise ValueError."""
    features = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "CF1 1AA"],
            "lat": [51.0, None],
            "lon": [-0.1, None],
            "ctry25cd": ["E92000001", "W92000004"],
        }
    )
    expected_error = pytest.raises(ValueError, match="unsupported or ungeocoded")

    with expected_error:
        _validate_postcode_feature_output(features, expected_postcode_count=2)
def test_postcode_feature_validation_rejects_wrong_count() -> None:
    """Too few, too many, and duplicated postcode rows are each rejected.

    The universe-size invariant: the postcode feature output must contain
    EXACTLY the active-England universe. Too few rows (silently dropped
    postcodes) and too many / duplicated rows (a join fan-out) must both fail,
    so neither a truncated build nor a one-to-many join can ship.
    """
    england = "E92000001"
    too_few = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "lat": [51.0],
            "lon": [-0.1],
            "ctry25cd": [england],
        }
    )
    too_many = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
            "lat": [51.0, 52.0, 53.0],
            "lon": [-0.1, -0.2, -0.3],
            "ctry25cd": [england] * 3,
        }
    )
    # Right row count but a duplicated key (n_unique < height) -- the signature
    # of a join fan-out.
    duplicated = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "AA1 1AA"],
            "lat": [51.0, 51.0],
            "lon": [-0.1, -0.1],
            "ctry25cd": [england, england],
        }
    )

    for bad_frame in (too_few, too_many, duplicated):
        with pytest.raises(ValueError, match="active England postcode universe"):
            _validate_postcode_feature_output(bad_frame, expected_postcode_count=2)
def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
    """Side tables unique on their join key yield exactly one row per postcode.

    Soundness: with side tables unique on their join key, the per-postcode
    feature joins emit exactly one row per postcode (no fan-out). A fan-out here
    would inflate the postcode universe above the active-England count -- the
    failure the universe-size validation is the backstop for.
    """
    postcodes = ["AA1 1AA", "BB2 2BB"]
    lsoas = ["E01000001", "E01000002"]
    constituencies = ["E14000001", "E14000002"]

    def postcode_keyed(extra_columns: dict) -> pl.LazyFrame:
        # Fresh postcode-keyed lazy frame with optional extra columns.
        return pl.LazyFrame({"postcode": postcodes, **extra_columns})

    base = postcode_keyed(
        {
            "lsoa21": lsoas,
            "Local Authority District code (2024)": ["E09000001", "E09000002"],
            "pcon": constituencies,
        }
    )
    side_tables = {
        "iod": pl.LazyFrame({"LSOA code (2021)": lsoas}),
        "ethnicity": pl.LazyFrame({"lsoa21": lsoas}),
        "crime": postcode_keyed(
            {
                "Serious crime (avg/yr)": [1.0, 2.0],
                "Minor crime (avg/yr)": [3.0, 4.0],
            }
        ),
        "median_age": pl.LazyFrame({"lsoa21": lsoas}),
        "election": pl.LazyFrame({"pcon": constituencies}),
        "poi_counts": postcode_keyed({}),
        "noise": postcode_keyed({}),
        "school_catchments": postcode_keyed({}),
        "conservation_areas": postcode_keyed(
            {CONSERVATION_AREA_FEATURE: ["Yes", "No"]}
        ),
        "tree_density": None,
        "broadband": pl.LazyFrame(
            {
                "bb_postcode": ["AA1 1AA", "BB2 2BB"],
                "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
            }
        ),
    }

    joined = _join_area_side_tables(base, **side_tables).collect()

    # One row per postcode in -> one row out; the universe is not inflated.
    assert joined.height == 2
    assert sorted(joined["postcode"].to_list()) == ["AA1 1AA", "BB2 2BB"]
def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
    """Broadband rows join despite spacing/casing drift, without fan-out.

    Broadband comes straight from Ofcom's CSV, so its postcode can drift in
    spacing/casing from the NSPL `pcds` base key. Both sides must be reduced
    to the same canonical form so a real postcode populates
    `max_download_speed` instead of silently missing the left join.
    """
    postcodes = ["AB1 2CD", "EF3 4GH"]
    lsoas = ["E01000001", "E01000002"]
    constituencies = ["E14000001", "E14000002"]

    def postcode_keyed(extra_columns: dict) -> pl.LazyFrame:
        # Fresh postcode-keyed lazy frame with optional extra columns.
        return pl.LazyFrame({"postcode": postcodes, **extra_columns})

    base = postcode_keyed(
        {
            "lsoa21": lsoas,
            "Local Authority District code (2024)": ["E09000001", "E09000002"],
            "pcon": constituencies,
        }
    )
    # AB1 2CD arrives lowercase + un-spaced; EF3 4GH arrives under two distinct
    # raw spellings that canonicalize to one key (the max speed must win, with
    # no fan-out of the base row).
    drifting_broadband = pl.LazyFrame(
        {
            "bb_postcode": ["ab1 2cd", "ef34gh", "EF3 4GH"],
            "max_download_speed": pl.Series([300, 30, 1000], dtype=pl.UInt16),
        }
    )
    side_tables = {
        "iod": pl.LazyFrame({"LSOA code (2021)": lsoas}),
        "ethnicity": pl.LazyFrame({"lsoa21": lsoas}),
        "crime": postcode_keyed(
            {
                "Serious crime (avg/yr)": [1.0, 2.0],
                "Minor crime (avg/yr)": [3.0, 4.0],
            }
        ),
        "median_age": pl.LazyFrame({"lsoa21": lsoas}),
        "election": pl.LazyFrame({"pcon": constituencies}),
        "poi_counts": postcode_keyed({}),
        "noise": postcode_keyed({}),
        "school_catchments": postcode_keyed({}),
        "conservation_areas": postcode_keyed(
            {CONSERVATION_AREA_FEATURE: ["Yes", "No"]}
        ),
        "tree_density": None,
        "broadband": drifting_broadband,
    }

    joined = _join_area_side_tables(base, **side_tables).collect()

    # No fan-out: still one row per base postcode.
    assert joined.height == 2

    joined_postcodes = joined["postcode"].to_list()
    joined_speeds = joined["max_download_speed"].to_list()
    speed_by_postcode = dict(zip(joined_postcodes, joined_speeds))
    # Spacing/casing drift still joins.
    assert speed_by_postcode["AB1 2CD"] == 300
    # Two raw spellings collapse to one canonical key; the max wins.
    assert speed_by_postcode["EF3 4GH"] == 1000

    # The temporary canonical join keys are not leaked into the output schema.
    for temporary_column in (
        "_base_canonical_postcode",
        "_bb_canonical_postcode",
        "bb_postcode",
    ):
        assert temporary_column not in joined.columns
def test_listed_building_feature_is_property_level() -> None:
    """The listed-building feature must NOT be listed in `_AREA_COLUMNS`."""
    area_columns = _AREA_COLUMNS
    assert LISTED_BUILDING_FEATURE not in area_columns
def test_postcode_conservation_area_flags_marks_point_membership() -> None:
    """A point inside the polygon is "Yes"; outside or null-latitude is "No"."""
    centroids = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
            "lat": [0.5, 2.0, None],
            "lon": [0.5, 2.0, 0.5],
        }
    )
    unit_square = box(0, 0, 1, 1)
    expected_flags = [("AA1 1AA", "Yes"), ("BB1 1BB", "No"), ("CC1 1CC", "No")]

    flags = _postcode_conservation_area_flags(
        centroids, [unit_square], "EPSG:4326", batch_size=2
    )
    flag_rows = flags.sort("postcode").to_dicts()

    assert flag_rows == [
        {"postcode": postcode, CONSERVATION_AREA_FEATURE: flag}
        for postcode, flag in expected_flags
    ]
def test_load_conservation_area_geometries_uses_current_planning_data_records(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path,
) -> None:
    """Only the `conservation-area` box with an empty end-date is returned.

    The stubbed reader also serves an ended record, a record from another
    dataset, and a point record; none of those may appear in the result.
    """
    geojson_path = tmp_path / "conservation_areas.geojson"
    current_area = box(0, 0, 1, 1)
    # (dataset, end-date, name, geometry) for each stubbed record.
    records = [
        ("conservation-area", "", "Central Village", current_area),
        ("conservation-area", "2025-01-01", "Old Boundary", box(2, 2, 3, 3)),
        ("listed-building", "", "Other", box(4, 4, 5, 5)),
        ("conservation-area", "", "Point Record", Point(0.5, 0.5)),
    ]

    def stub_read_arrow(path):
        # Stand-in for pyogrio.read_arrow: returns (metadata, arrow table).
        assert path == geojson_path
        datasets, end_dates, names, shapes = zip(*records)
        table = pa.table(
            {
                "dataset": list(datasets),
                "end-date": list(end_dates),
                "name": list(names),
                "SHAPE": to_wkb(list(shapes)),
            }
        )
        metadata = {"geometry_name": "SHAPE", "crs": "EPSG:4326"}
        return metadata, table

    monkeypatch.setattr("pipeline.transform.merge.pyogrio.read_arrow", stub_read_arrow)

    geometries, crs = _load_conservation_area_geometries(geojson_path)

    assert crs == "EPSG:4326"
    assert geometries == [current_area]
def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
    """Only the listed point close to a postcode becomes a candidate."""
    # Upper-case, collapse non-alphanumerics to single spaces, then trim.
    match_name = (
        pl.col("Name")
        .str.to_uppercase()
        .str.replace_all(r"[^0-9A-Z]+", " ")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
        .alias("_listed_match_name")
    )
    raw_points = pl.DataFrame(
        {
            "ListEntry": [1234, 5678],
            "Name": ["1 and 2 High Street", "Distant Hall"],
            "Grade": ["II", "I"],
            "Easting": [100.0, 1000.0],
            "Northing": [100.0, 1000.0],
        }
    )
    listed_points = raw_points.with_columns(match_name)
    postcode_grid = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "east1m": [105.0, 5000.0],
            "north1m": [105.0, 5000.0],
        }
    )

    candidates = _postcode_listed_building_candidates(
        listed_points,
        postcode_grid,
        nearest_postcodes=1,
        max_distance_m=25,
    )
    candidate_rows = candidates.select("postcode", "_listed_match_name").to_dicts()

    assert candidate_rows == [
        {"postcode": "AA1 1AA", "_listed_match_name": "1 AND 2 HIGH STREET"}
    ]
def test_matched_listed_building_flags_requires_address_match() -> None:
    """Two of the three properties are flagged; 99 HIGH STREET is not returned."""
    property_rows = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AA", "BB1 1BB"],
            "pp_address": ["1 HIGH STREET", "99 HIGH STREET", "THE OLD RECTORY"],
            "epc_address": ["1, High Street", "99, High Street", "Old Rectory"],
        }
    )
    candidates = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "_listed_match_name": ["1 AND 2 HIGH STREET", "OLD RECTORY"],
            "_listed_grade": ["II", "II*"],
            "_listed_entry": [1234, 5678],
        }
    )
    expected_matches = [("AA1 1AA", "1 HIGH STREET"), ("BB1 1BB", "THE OLD RECTORY")]

    flags = _matched_listed_building_flags(
        property_rows.lazy(), candidates, min_score=95
    )
    flag_rows = flags.sort("postcode", "pp_address").to_dicts()

    assert flag_rows == [
        {
            "postcode": postcode,
            "pp_address": address,
            LISTED_BUILDING_FEATURE: "Yes",
        }
        for postcode, address in expected_matches
    ]
def test_validate_property_postcodes_rejects_blank_rows() -> None:
    """A property row with an empty-string postcode raises ValueError."""
    property_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", ""],
            "Address per Property Register": ["1 Example Street", "2 Example Street"],
            "Last known price": [100_000, 200_000],
        }
    )
    expected_error = pytest.raises(ValueError, match="Property rows missing a postcode")

    with expected_error:
        _validate_property_postcodes(property_rows)
def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
) -> None:
    """Validation passes when rent data is missing only for these two LADs.

    The IoD file lists Barnsley, Isles of Scilly and City of London, while the
    rental file covers Barnsley alone; the call must not raise.
    """
    iod_path = tmp_path / "iod.parquet"
    rental_path = tmp_path / "rental.parquet"

    # (LAD code, LAD name) pairs present in the IoD source.
    districts = [
        ("E08000016", "Barnsley"),
        ("E06000053", "Isles of Scilly"),
        ("E09000001", "City of London"),
    ]
    iod = pl.DataFrame(
        {
            "Local Authority District code (2024)": [code for code, _ in districts],
            "Local Authority District name (2024)": [name for _, name in districts],
        }
    )
    iod.write_parquet(iod_path)

    rental = pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]})
    rental.write_parquet(rental_path)

    _validate_lad_source_coverage(iod_path, rental_path)
def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    """An IoD district absent from the rental file raises ValueError."""
    iod_path = tmp_path / "iod.parquet"
    rental_path = tmp_path / "rental.parquet"

    iod = pl.DataFrame(
        {
            "Local Authority District code (2024)": ["E08000016"],
            "Local Authority District name (2024)": ["Barnsley"],
        }
    )
    iod.write_parquet(iod_path)

    # The rental file covers a different area code than the IoD district.
    rental = pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]})
    rental.write_parquet(rental_path)

    with pytest.raises(ValueError, match="Rental data is missing"):
        _validate_lad_source_coverage(iod_path, rental_path)
def test_validate_lsoa_source_coverage_allows_full_ethnicity_coverage(
    tmp_path,
) -> None:
    """Validation passes when ethnicity covers every IoD LSOA, plus an extra."""
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"

    iod = pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]})
    iod.write_parquet(iod_path)

    # Ethnicity may carry extra LSOAs (e.g. property-less ones); only the IoD
    # LSOAs are required to all be present.
    ethnicity = pl.DataFrame({"lsoa21": ["E01000001", "E01000002", "E01000003"]})
    ethnicity.write_parquet(ethnicity_path)

    _validate_lsoa_source_coverage(iod_path, ethnicity_path)
def test_validate_lsoa_source_coverage_rejects_missing_lsoa(tmp_path) -> None:
    """An IoD LSOA absent from the ethnicity file raises ValueError."""
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"

    iod = pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]})
    iod.write_parquet(iod_path)

    # E01000002 is deliberately left out of the ethnicity source.
    ethnicity = pl.DataFrame({"lsoa21": ["E01000001"]})
    ethnicity.write_parquet(ethnicity_path)

    with pytest.raises(ValueError, match="Ethnicity data is missing LSOA coverage"):
        _validate_lsoa_source_coverage(iod_path, ethnicity_path)
def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
    """The 50m percentile column is exposed as the tree feature, typed Float32."""
    source_path = tmp_path / "tree_density_by_postcode.parquet"
    percentiles = [12.5, 99.0]
    source = pl.DataFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            "Tree canopy density percentile within 50m": percentiles,
        }
    )
    source.write_parquet(source_path)

    densities = _tree_density_by_postcode(source_path).collect().sort("postcode")

    assert densities.columns == ["postcode", TREE_DENSITY_FEATURE]
    assert densities[TREE_DENSITY_FEATURE].to_list() == percentiles
    assert densities.schema[TREE_DENSITY_FEATURE] == pl.Float32
def test_tree_density_by_postcode_requires_postcode_and_density_columns(
    tmp_path,
) -> None:
    """A file lacking the density column or the postcode column is rejected."""
    # Case 1: postcode present, but no density column.
    no_density_path = tmp_path / "tree_density_by_postcode.parquet"
    no_density = pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]})
    no_density.write_parquet(no_density_path)
    with pytest.raises(ValueError, match="must contain column"):
        _tree_density_by_postcode(no_density_path)

    # Case 2: density column present, but no postcode column.
    no_postcode_path = tmp_path / "missing_postcode.parquet"
    no_postcode = pl.DataFrame({"Tree canopy density percentile within 50m": [12.5]})
    no_postcode.write_parquet(no_postcode_path)
    with pytest.raises(ValueError, match="missing required column: postcode"):
        _tree_density_by_postcode(no_postcode_path)
def _sample_listings_frame() -> pl.DataFrame:
    """Return a one-row listings frame with an explicit dtype for every column."""
    # (column name, dtype, single-row values); order fixes the column order.
    fields = [
        ("Bedrooms", pl.Int32, [3]),
        ("Bathrooms", pl.Int32, [2]),
        ("Number of bedrooms & living rooms", pl.Int32, [4]),
        ("lon", pl.Float64, [-0.1]),
        ("lat", pl.Float64, [51.5]),
        ("Postcode", pl.Utf8, ["sw1a1aa"]),
        ("Address per Property Register", pl.Utf8, ["1 Example Road"]),
        ("Leasehold/Freehold", pl.Utf8, ["Freehold"]),
        ("Property type", pl.Utf8, ["Terraced"]),
        ("Property sub-type", pl.Utf8, ["Mid-Terrace"]),
        ("Price qualifier", pl.Utf8, [""]),
        ("Total floor area (sqm)", pl.Float64, [120.0]),
        ("Listing URL", pl.Utf8, ["https://example.test/abc"]),
        ("Listing features", pl.List(pl.Utf8), [["Garden", "Off-street parking"]]),
        ("Listing date", pl.Datetime("us"), [None]),
        ("Listing status", pl.Utf8, ["For sale"]),
        ("Asking price", pl.Int64, [750_000]),
        ("Asking price per sqm", pl.Int32, [6_250]),
    ]
    data = {name: values for name, _dtype, values in fields}
    schema = {name: dtype for name, dtype, _values in fields}
    return pl.DataFrame(data, schema=schema)
def _stub_arcgis(path) -> None:
    """Write a one-row postcode lookup parquet (SW1A 1AA) to `path`."""
    # (column name, dtype, single-row values); order fixes the column order.
    fields = [
        ("pcds", pl.Utf8, ["SW1A 1AA"]),
        ("ctry25cd", pl.Utf8, ["E92000001"]),
        ("doterm", pl.Utf8, [None]),
        ("east1m", pl.Float64, [530000.0]),
        ("north1m", pl.Float64, [180000.0]),
    ]
    data = {name: values for name, _dtype, values in fields}
    schema = {name: dtype for name, dtype, _values in fields}
    pl.DataFrame(data, schema=schema).write_parquet(path)
def test_canonical_postcode_expr_formats_compact_postcodes() -> None:
    """Compact and spaced spellings give "SW1A 1AA"; "bad" and null give null."""
    raw = pl.DataFrame({"Postcode": ["sw1a1aa", "SW1A 1AA", "bad", None]})
    canonical_expr = _canonical_postcode_expr("Postcode").alias("canonical")

    formatted = raw.with_columns(canonical_expr)

    assert formatted["canonical"].to_list() == ["SW1A 1AA", "SW1A 1AA", None, None]
def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
    tmp_path,
) -> None:
    """Loaded listings carry a canonical postcode plus `_actual_*` overlays."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    loaded = _load_listings_for_merge(listings_path, arcgis_path)

    expected_values = {
        "postcode": "SW1A 1AA",
        "pp_address": "1 Example Road",
        "_actual_listing_url": "https://example.test/abc",
        "_actual_asking_price": 750_000,
        "_actual_lat": 51.5,
    }
    for column, value in expected_values.items():
        assert loaded[column].to_list() == [value]
def test_load_listings_for_merge_uprn_key_matches_normalize_uprn(tmp_path) -> None:
    """A Float64 UPRN yields the same digits-only key as `_normalize_uprn`.

    A Float UPRN (e.g. read from a NaN-bearing parquet column) must produce
    the same digits-only key as `_normalize_uprn` on the candidate side, so
    the exact UPRN match is not lost. Naively stringifying "100023336956.0"
    and stripping non-digits would yield "1000233369560" (a bogus trailing
    zero) which never collides with the candidate key "100023336956".
    """
    float_uprn = 100023336956.0
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    uprn_column = pl.lit(float_uprn, dtype=pl.Float64).alias("UPRN")
    _sample_listings_frame().with_columns(uprn_column).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    loaded = _load_listings_for_merge(listings_path, arcgis_path)

    assert loaded["_listing_uprn"].to_list() == [_normalize_uprn(float_uprn)]
    assert loaded["_listing_uprn"].to_list() == ["100023336956"]
def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
    tmp_path,
) -> None:
    """The seed row copies listing facts into the template's property columns."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    listings = _load_listings_for_merge(listings_path, arcgis_path)

    schema_fields = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "pp_property_type": pl.Utf8,
        "duration": pl.Utf8,
        "total_floor_area": pl.Float64,
        "number_habitable_rooms": pl.Int16,
        "latest_price": pl.Int64,
        "epc_address": pl.Utf8,
    }
    schema_fields.update(
        (dst, dtype) for _src, dst, dtype in _LISTING_OVERLAY_SOURCES
    )
    template_schema = pl.Schema(schema_fields)

    unmatched_idxs = listings.select("_listing_idx")
    seed = _build_unmatched_listing_seed_rows(unmatched_idxs, listings, template_schema)

    assert seed.height == 1
    expected_values = {
        "postcode": "SW1A 1AA",
        "pp_address": "1 Example Road",
        "pp_property_type": "Terraced",
        "duration": "Freehold",
        "total_floor_area": 120.0,
        "number_habitable_rooms": 4,
        "latest_price": 750_000,
        # Columns not populated from the listing default to null.
        "epc_address": None,
        # Overlay columns flow through 1:1.
        "_actual_listing_url": "https://example.test/abc",
    }
    for column, value in expected_values.items():
        assert seed[column].to_list() == [value]
def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
    tmp_path,
) -> None:
    """With listing values nulled, the seed row takes the `_direct_*` values."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # Null out the listing's own floor area and room count.
    nulled_listing_facts = [
        pl.lit(None, dtype=pl.Float64).alias("Total floor area (sqm)"),
        pl.lit(None, dtype=pl.Int32).alias("Number of bedrooms & living rooms"),
    ]
    _sample_listings_frame().with_columns(*nulled_listing_facts).write_parquet(
        listings_path
    )
    _stub_arcgis(arcgis_path)

    direct_epc_facts = [
        pl.lit("1 Example Road").alias("_direct_epc_address"),
        pl.lit("C").alias("_direct_current_energy_rating"),
        pl.lit("B").alias("_direct_potential_energy_rating"),
        pl.lit(98.0).alias("_direct_total_floor_area"),
        pl.lit(4, dtype=pl.Int16).alias("_direct_number_habitable_rooms"),
        pl.lit(2.4).alias("_direct_floor_height"),
        pl.lit(1930, dtype=pl.UInt16).alias("_direct_construction_age_band"),
        pl.lit(1, dtype=pl.UInt8).alias("_direct_is_construction_date_approximate"),
        pl.lit("No").alias("_direct_was_council_house"),
    ]
    listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns(
        *direct_epc_facts
    )

    schema_fields = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "total_floor_area": pl.Float64,
        "number_habitable_rooms": pl.Int16,
        "epc_address": pl.Utf8,
        "current_energy_rating": pl.Utf8,
        "was_council_house": pl.Utf8,
    }
    schema_fields.update(
        (dst, dtype) for _src, dst, dtype in _LISTING_OVERLAY_SOURCES
    )
    template_schema = pl.Schema(schema_fields)

    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )

    expected_values = {
        "total_floor_area": 98.0,
        "number_habitable_rooms": 4,
        "epc_address": "1 Example Road",
        "current_energy_rating": "C",
        "was_council_house": "No",
    }
    for column, value in expected_values.items():
        assert seed[column].to_list() == [value]
def test_build_unmatched_listing_seed_rows_prefers_direct_epc_rooms_over_listing(
    tmp_path,
) -> None:
    """When both room counts exist, the direct-EPC count (3) beats the listing (5).

    When BOTH the listing room count and a direct-EPC count exist, the EPC
    value must win: the scraped "Number of bedrooms & living rooms" is actually
    bedrooms + bathrooms (upstream defect), so preferring it would inflate the
    count. This pins the coalesce direction (direct-EPC before listing).
    """
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # The corrupt listing room count (beds + baths).
    listing_rooms = pl.lit(5, dtype=pl.Int32).alias("Number of bedrooms & living rooms")
    _sample_listings_frame().with_columns(listing_rooms).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # The genuine EPC habitable-room count.
    epc_rooms = pl.lit(3, dtype=pl.Int16).alias("_direct_number_habitable_rooms")
    listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns(
        epc_rooms
    )

    schema_fields = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "number_habitable_rooms": pl.Int16,
    }
    schema_fields.update(
        (dst, dtype) for _src, dst, dtype in _LISTING_OVERLAY_SOURCES
    )
    template_schema = pl.Schema(schema_fields)

    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )

    assert seed["number_habitable_rooms"].to_list() == [3]
# Column dtypes passed as `schema=` when `_direct_epc_candidates` builds its frame.
_DIRECT_EPC_CANDIDATE_SCHEMA = dict(
    _direct_epc_row=pl.UInt32,
    _direct_epc_match_address=pl.Utf8,
    _direct_epc_match_postcode=pl.Utf8,
    _direct_epc_outcode=pl.Utf8,
    _direct_epc_canonical_property_type=pl.Utf8,
    _direct_epc_uprn=pl.Utf8,
    _direct_epc_address=pl.Utf8,
    _direct_current_energy_rating=pl.Utf8,
    _direct_potential_energy_rating=pl.Utf8,
    _direct_total_floor_area=pl.Float64,
    _direct_number_habitable_rooms=pl.Int16,
    _direct_floor_height=pl.Float64,
    _direct_construction_age_band=pl.UInt16,
    _direct_is_construction_date_approximate=pl.UInt8,
    _direct_was_council_house=pl.Utf8,
)
# Column dtypes passed as `schema=` when `_listing_matches` builds its frame.
_LISTING_MATCH_SCHEMA = dict(
    _listing_idx=pl.UInt32,
    _listing_match_address=pl.Utf8,
    _listing_match_postcode=pl.Utf8,
    _listing_uprn=pl.Utf8,
)
def _direct_epc_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build a candidate frame, one row per dict in `rows`.

    Each dict overrides the default values below; columns are typed by
    `_DIRECT_EPC_CANDIDATE_SCHEMA`.
    """
    defaults = {
        "_direct_epc_row": 0,
        "_direct_epc_match_address": "1 EXAMPLE ROAD",
        "_direct_epc_match_postcode": "AA11AA",
        "_direct_epc_outcode": "AA1",
        "_direct_epc_canonical_property_type": "Terraced",
        "_direct_epc_uprn": None,
        "_direct_epc_address": "1, Example Road",
        "_direct_current_energy_rating": "C",
        "_direct_potential_energy_rating": "B",
        "_direct_total_floor_area": 101.0,
        "_direct_number_habitable_rooms": 4,
        "_direct_floor_height": 2.5,
        "_direct_construction_age_band": 1930,
        "_direct_is_construction_date_approximate": 1,
        "_direct_was_council_house": "No",
    }
    # Later keys win, so each row's overrides replace the defaults.
    records = [{**defaults, **overrides} for overrides in rows]
    return pl.DataFrame(records, schema=_DIRECT_EPC_CANDIDATE_SCHEMA)
def _listing_matches(rows: list[dict]) -> pl.DataFrame:
    """Build a listing frame, one row per dict in `rows`.

    Each dict overrides the default values below; columns are typed by
    `_LISTING_MATCH_SCHEMA`.
    """
    defaults = {
        "_listing_idx": 0,
        "_listing_match_address": "1 EXAMPLE ROAD",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
    }
    # Later keys win, so each row's overrides replace the defaults.
    records = [{**defaults, **overrides} for overrides in rows]
    return pl.DataFrame(records, schema=_LISTING_MATCH_SCHEMA)
def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None:
    """A shared UPRN matches even when the two postcodes differ.

    UPRN is matched globally (not within a postcode bucket), so a listing
    whose detail-page postcode is slightly off still resolves to the right
    EPC certificate by its UPRN.
    """
    shared_uprn = "100000000001"
    listings = _listing_matches(
        [{"_listing_uprn": shared_uprn, "_listing_match_postcode": "ZZ99ZZ"}]
    )
    candidates = _direct_epc_candidates(
        [{"_direct_epc_uprn": shared_uprn, "_direct_epc_match_postcode": "AA11AA"}]
    )

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["uprn"]
def test_match_direct_epc_matches_by_address_in_same_postcode() -> None:
    """Identical match addresses in one postcode match with method "address"."""
    match_address = "1 EXAMPLE ROAD"
    listings = _listing_matches([{"_listing_match_address": match_address}])
    candidates = _direct_epc_candidates([{"_direct_epc_match_address": match_address}])

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["address"]
def test_match_direct_epc_street_fallback_matches_numberless_listing() -> None:
    """A numberless listing matches a same-street certificate as "street".

    A street-level listing address (the Rightmove norm: no house number, no
    UPRN) cannot pass the strict number gate, but must still pick up
    street-representative EPC facts from a same-street certificate in its own
    postcode, labelled with the lower-confidence "street" method.
    """
    listings = _listing_matches([{"_listing_match_address": "EXAMPLE ROAD BROMLEY"}])
    candidates = _direct_epc_candidates(
        [{"_direct_epc_match_address": "7 EXAMPLE ROAD"}]
    )

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    assert matches["_direct_epc_match_method"].to_list() == ["street"]
def test_match_direct_epc_street_fallback_prefers_attribute_agreement() -> None:
    """Of two same-street certificates, the one nearer in floor area wins.

    Every same-street certificate ties on street similarity, so the listing's
    attributes (floor area here) must pick the most plausible one.
    """
    listing_schema = {**_LISTING_MATCH_SCHEMA, "_actual_total_floor_area": pl.Float64}
    listing_row = {
        "_listing_idx": 0,
        "_listing_match_address": "EXAMPLE ROAD BROMLEY",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
        "_actual_total_floor_area": 78.0,
    }
    listings = pl.DataFrame([listing_row], schema=listing_schema)

    # 150 sqm is far from the listing's 78 sqm; 80 sqm is close.
    far_area_candidate = {
        "_direct_epc_match_address": "7 EXAMPLE ROAD",
        "_direct_epc_address": "7, Example Road",
        "_direct_total_floor_area": 150.0,
    }
    near_area_candidate = {
        "_direct_epc_row": 1,
        "_direct_epc_match_address": "9 EXAMPLE ROAD",
        "_direct_epc_address": "9, Example Road",
        "_direct_total_floor_area": 80.0,
    }
    candidates = _direct_epc_candidates([far_area_candidate, near_area_candidate])

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["9, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["street"]
def test_match_direct_epc_street_fallback_spans_postcodes_within_outcode() -> None:
    """A same-street certificate in a sibling postcode still street-matches.

    Long streets cross postcode units. A street-only listing whose own
    postcode has no certificate must still pick up a same-street certificate
    from a sibling postcode in the same outcode.
    """
    listing_row = {
        "_listing_match_address": "EXAMPLE ROAD BROMLEY",
        "_listing_match_postcode": "AA12ZZ",
    }
    sibling_postcode_candidate = {
        "_direct_epc_match_address": "7 EXAMPLE ROAD",
        "_direct_epc_match_postcode": "AA11AA",
    }
    listings = _listing_matches([listing_row])
    candidates = _direct_epc_candidates([sibling_postcode_candidate])

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    assert matches["_direct_epc_match_method"].to_list() == ["street"]
def test_match_direct_epc_street_fallback_prefers_own_postcode_segment() -> None:
    """The certificate sharing the listing's postcode beats a sibling-postcode one.

    Within one street, the certificate in the listing's own postcode unit is
    the nearest segment and must win over an equal candidate further along.
    """
    listings = _listing_matches([{"_listing_match_address": "EXAMPLE ROAD BROMLEY"}])

    sibling_postcode_candidate = {
        "_direct_epc_match_address": "7 EXAMPLE ROAD",
        "_direct_epc_address": "7, Example Road",
        "_direct_epc_match_postcode": "AA12ZZ",
    }
    own_postcode_candidate = {
        "_direct_epc_row": 1,
        "_direct_epc_match_address": "9 EXAMPLE ROAD",
        "_direct_epc_address": "9, Example Road",
        "_direct_epc_match_postcode": "AA11AA",
    }
    candidates = _direct_epc_candidates(
        [sibling_postcode_candidate, own_postcode_candidate]
    )

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["9, Example Road"]
def test_match_direct_epc_street_fallback_recovers_numbered_listing() -> None:
    """A numbered listing with no certificate for its number falls back to street.

    The house-number sets are disjoint, so the strict gate skips every
    candidate; the listing must still pick up a street-representative
    certificate via the fallback.
    """
    numbered_listing = {"_listing_match_address": "17 EXAMPLE ROAD BROMLEY"}
    other_number_certificate = {"_direct_epc_match_address": "9 EXAMPLE ROAD"}

    listings = _listing_matches([numbered_listing])
    candidates = _direct_epc_candidates([other_number_certificate])
    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    assert matches["_direct_epc_match_method"].to_list() == ["street"]
def test_match_direct_epc_street_fallback_rejects_town_only_address() -> None:
    """A town-only listing address must not street-match any certificate.

    "COULSDON SURREY" shares only the locality suffix that most street keys in
    the outcode carry; without a street-name anchor it must not subset-inflate
    onto an arbitrary street.
    """
    numbered_streets = [
        ("49", "LACKFORD ROAD"),
        ("12", "CHIPSTEAD VALLEY ROAD"),
        ("3", "WINDERMERE ROAD"),
    ]
    # One certificate per street, each carrying the same locality suffix.
    certificates = [
        {
            "_direct_epc_row": i,
            "_direct_epc_match_address": f"{number} {street} SURREY COULSDON",
        }
        for i, (number, street) in enumerate(numbered_streets)
    ]

    listings = _listing_matches([{"_listing_match_address": "COULSDON SURREY"}])
    matches = _match_direct_epc(listings, _direct_epc_candidates(certificates))

    assert matches.height == 0
def test_match_direct_epc_street_fallback_rejects_single_token_query() -> None:
    """A bare one-token listing address must not street-match anything.

    token_set_ratio scores 100 whenever the query's tokens subset the
    candidate's, so a single-token query would otherwise match trivially.
    """
    one_token_listing = {"_listing_match_address": "KINGSWOOD"}
    superset_certificate = {"_direct_epc_match_address": "4 KINGSWOOD ROAD"}

    listings = _listing_matches([one_token_listing])
    candidates = _direct_epc_candidates([superset_certificate])
    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 0
def test_match_direct_epc_street_fallback_rejects_different_street() -> None:
    """A certificate on another street must not be matched by the fallback.

    The fallback is street-identity within the postcode, not "anything in the
    postcode".
    """
    listing_on_oldstead = {"_listing_match_address": "OLDSTEAD ROAD BROMLEY"}
    certificate_on_cambridge = {"_direct_epc_match_address": "5 CAMBRIDGE ROAD"}

    listings = _listing_matches([listing_on_oldstead])
    candidates = _direct_epc_candidates([certificate_on_cambridge])
    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 0
def test_normalize_uprn_handles_types_and_floats() -> None:
    """`_normalize_uprn` accepts strings, ints and integral floats only."""
    # Missing or empty input yields no UPRN.
    for blank in (None, ""):
        assert _normalize_uprn(blank) is None

    # Padded strings and plain ints both normalise to the bare digit string.
    expected_digits = "100012345678"
    assert _normalize_uprn(" 100012345678 ") == expected_digits
    assert _normalize_uprn(100012345678) == expected_digits

    # An integral float normalises to its digits, NOT "1230".
    assert _normalize_uprn(123.0) == "123"

    # Non-integral / NaN floats are rejected rather than mangled.
    for unusable in (1.5, float("nan")):
        assert _normalize_uprn(unusable) is None
def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
    """`was_council_house` is "Yes" if either side says so; other pairs coalesce.

    The raw property value is fill_null("No") upstream, so a plain coalesce
    would let a non-null "No" override a directly-matched listing "Yes".
    "Former council house" should fire if EITHER side says "Yes".
    """
    all_null = [None] * 5
    columns_under_test = {
        "was_council_house": ["No", "Yes", "No", None, None],
        "_direct_was_council_house": ["Yes", "No", None, "Yes", None],
        # An unrelated direct-EPC column keeps the plain-coalesce behaviour.
        "current_energy_rating": [None, "C", "D", None, None],
        "_direct_current_energy_rating": ["B", "A", None, "E", None],
    }
    # _coalesce_direct_epc_columns coalesces every pair in
    # _DIRECT_EPC_RAW_COLUMN_MAP, so the remaining pairs must exist too; they
    # are filled with nulls because this test does not inspect them.
    null_padding = dict.fromkeys(
        [
            "epc_address",
            "_direct_epc_address",
            "potential_energy_rating",
            "_direct_potential_energy_rating",
            "total_floor_area",
            "_direct_total_floor_area",
            "number_habitable_rooms",
            "_direct_number_habitable_rooms",
            "floor_height",
            "_direct_floor_height",
            "construction_age_band",
            "_direct_construction_age_band",
            "is_construction_date_approximate",
            "_direct_is_construction_date_approximate",
        ],
        all_null,
    )
    wide = pl.LazyFrame({**columns_under_test, **null_padding})

    coalesced = _coalesce_direct_epc_columns(wide).collect()

    council_house = coalesced["was_council_house"].to_list()
    assert council_house == ["Yes", "Yes", "No", "Yes", None]
    # Plain coalesce (raw wins when non-null) is untouched for other columns.
    energy_rating = coalesced["current_energy_rating"].to_list()
    assert energy_rating == ["B", "C", "D", "E", None]
def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    """A postcode absent from the crime table keeps null crime rollups.

    The crime table is LEFT-joined per postcode; a postcode absent from it
    must NOT be fabricated as "zero crime" (the safest value). The
    Serious/Minor rollups are precomputed in crime_spatial (the mean of the
    by-year rollup bars), so the merge reads them straight through; a missing
    postcode leaves them null.
    """
    postcodes = ["AA1 1AA", "BB2 2BB"]
    lsoa_codes = ["E01000001", "E01000002"]
    constituencies = ["E14000001", "E14000002"]

    def _keyed_by_postcode(extra: dict) -> pl.LazyFrame:
        # Side table with one row per test postcode plus any extra columns.
        return pl.LazyFrame({"postcode": postcodes, **extra})

    base = pl.LazyFrame(
        {
            "postcode": postcodes,
            "lsoa21": lsoa_codes,
            "Local Authority District code (2024)": ["E09000001", "E09000002"],
            "pcon": constituencies,
        }
    )

    # Per-type columns. Each group sums to 10.0, deliberately different from
    # the precomputed headline values below.
    # NOTE(review): the serious/minor split is inferred from those sums --
    # confirm against crime_spatial.
    serious_types = {
        "Violence and sexual offences (avg/yr)": [1.0],
        "Robbery (avg/yr)": [2.0],
        "Burglary (avg/yr)": [3.0],
        "Possession of weapons (avg/yr)": [4.0],
    }
    minor_types = {
        "Anti-social behaviour (avg/yr)": [1.0],
        "Criminal damage and arson (avg/yr)": [1.0],
        "Shoplifting (avg/yr)": [1.0],
        "Bicycle theft (avg/yr)": [1.0],
        "Theft from the person (avg/yr)": [1.0],
        "Other theft (avg/yr)": [1.0],
        "Vehicle crime (avg/yr)": [1.0],
        "Public order (avg/yr)": [1.0],
        "Drugs (avg/yr)": [1.0],
        "Other crime (avg/yr)": [1.0],
    }
    # Precomputed rollup headlines: NOT the per-type sums, so the test proves
    # the merge consumes these columns rather than re-summing per-type ones.
    headlines = {
        "Serious crime (avg/yr)": [7.5],
        "Minor crime (avg/yr)": [4.2],
    }
    # Crime is present only for AA1 1AA; BB2 2BB is absent from the table.
    crime = pl.LazyFrame(
        {"postcode": ["AA1 1AA"], **serious_types, **minor_types, **headlines}
    )

    broadband = pl.LazyFrame(
        {
            "bb_postcode": ["AA1 1AA", "BB2 2BB"],
            "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
        }
    )

    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": lsoa_codes}),
        ethnicity=pl.LazyFrame({"lsoa21": lsoa_codes}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": lsoa_codes}),
        election=pl.LazyFrame({"pcon": constituencies}),
        poi_counts=_keyed_by_postcode({}),
        noise=_keyed_by_postcode({}),
        school_catchments=_keyed_by_postcode({}),
        conservation_areas=_keyed_by_postcode(
            {CONSERVATION_AREA_FEATURE: ["Yes", "No"]}
        ),
        tree_density=None,
        broadband=broadband,
    ).collect()

    rollups = joined.select("postcode", "serious_crime_avg_yr", "minor_crime_avg_yr")
    rollups_by_postcode = {
        row["postcode"]: row for row in rollups.iter_rows(named=True)
    }

    # Present postcode: rollups are the precomputed headline values, read
    # through unchanged (NOT the per-type sum of 10.0).
    assert rollups_by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 7.5
    assert rollups_by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 4.2
    # Missing postcode: rollups stay null rather than fabricating 0.0.
    assert rollups_by_postcode["BB2 2BB"]["serious_crime_avg_yr"] is None
    assert rollups_by_postcode["BB2 2BB"]["minor_crime_avg_yr"] is None
def test_dedupe_collapsed_properties_keeps_most_recent_per_address() -> None:
    """Dedup keeps the latest sale per (postcode, pp_address) and nothing else.

    The terminated-postcode remap can merge two distinct postcodes onto one
    active successor, collapsing the same physical address onto a single
    (postcode, pp_address) key with conflicting sale records. The dedup must
    keep exactly one row per key -- the most recent transaction -- and must
    not collapse genuinely distinct addresses.
    """
    from datetime import datetime

    # (postcode, pp_address, date_of_transfer, latest_price) per sale record.
    sales = [
        ("SW3 3JY", "45 ELYSTAN PLACE", datetime(1990, 1, 1), 1_587_700),
        ("SW3 3JY", "45 ELYSTAN PLACE", datetime(2015, 6, 1), 4_500_000),
        ("SW3 3JY", "9 OTHER ROAD", datetime(2000, 1, 1), 250_000),
    ]
    postcodes, addresses, transfer_dates, prices = (
        list(column) for column in zip(*sales)
    )
    wide = pl.LazyFrame(
        {
            "postcode": postcodes,
            "pp_address": addresses,
            "date_of_transfer": transfer_dates,
            "latest_price": prices,
        }
    )

    deduped = _dedupe_collapsed_properties(wide).collect()

    # One row per (postcode, pp_address): the two ELYSTAN PLACE rows collapse.
    assert deduped.height == 2
    assert deduped.select(["postcode", "pp_address"]).is_unique().all()

    rows_by_address = {
        row["pp_address"]: row for row in deduped.iter_rows(named=True)
    }
    # The kept ELYSTAN PLACE row is the most recent transaction (2015 @ 4.5M),
    # not an arbitrary one.
    assert rows_by_address["45 ELYSTAN PLACE"]["date_of_transfer"] == datetime(
        2015, 6, 1
    )
    assert rows_by_address["45 ELYSTAN PLACE"]["latest_price"] == 4_500_000
    # A genuinely distinct address in the same postcode is untouched.
    assert rows_by_address["9 OTHER ROAD"]["latest_price"] == 250_000
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build a property-candidate frame, one row per override dict in *rows*.

    Each row starts from a default "1 Example Road" property in "AA1 1AA" with
    no UPRN; keys supplied in the row replace the defaults. Every column is
    typed as Utf8.
    """
    defaults = {
        "postcode": "AA1 1AA",
        "pp_address": "1 Example Road",
        "_property_match_postcode": "AA11AA",
        "_property_match_address": "1 EXAMPLE ROAD",
        "_property_epc_match_address": "1 EXAMPLE ROAD",
        "uprn": None,
    }
    # The schema covers exactly the default columns, in the same order.
    utf8_schema = dict.fromkeys(defaults, pl.Utf8)
    records = [{**defaults, **overrides} for overrides in rows]
    return pl.DataFrame(records, schema=utf8_schema)
def test_match_listing_properties_uprn_wins_dedup_tie() -> None:
    """A UPRN match beats an exact-address match competing for one property.

    Two listings claim the same property: one by UPRN, one by exact address
    (both score 100). The UPRN match must win even though it has the higher
    _listing_idx (which would otherwise break the tie the wrong way).
    """
    shared_uprn = "100000000001"
    claims_by_uprn = {
        "_listing_idx": 5,
        "_listing_uprn": shared_uprn,
        "_listing_match_address": "SOMETHING ELSE",
    }
    claims_by_address = {
        "_listing_idx": 1,
        "_listing_uprn": None,
        "_listing_match_address": "1 EXAMPLE ROAD",
    }

    listings = _listing_matches([claims_by_uprn, claims_by_address])
    properties = _property_candidates([{"uprn": shared_uprn}])
    matches = _match_listing_properties(listings, properties)

    assert matches.height == 1
    assert matches["_listing_idx"].to_list() == [5]
    assert matches["_property_match_method"].to_list() == ["uprn"]
def test_match_direct_epc_does_not_match_other_outcode_without_uprn() -> None:
    """A certificate in another outcode with no shared UPRN is never matched.

    Matching is by postcode/UPRN/street -- never by coordinate proximity -- and
    the street fallback is outcode-scoped, so a same-street EPC in a different
    OUTCODE with no shared UPRN is skipped.
    """
    other_outcode_certificate = {
        "_direct_epc_match_postcode": "BB22BB",
        "_direct_epc_outcode": "BB2",
        "_direct_epc_uprn": None,
    }

    listings = _listing_matches([{"_listing_match_postcode": "AA11AA"}])
    candidates = _direct_epc_candidates([other_outcode_certificate])
    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 0
def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None:
    """The listing overlay lands on the matching property row only."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # Column name -> (dtype, values); row 0 is "9 Other Road", row 1 is
    # "1 Example Road".
    property_columns = {
        "postcode": (pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        "pp_address": (pl.Utf8, ["9 Other Road", "1 Example Road"]),
        "pp_property_type": (pl.Utf8, ["Detached", "Terraced"]),
        "duration": (pl.Utf8, ["Freehold", "Freehold"]),
        "total_floor_area": (pl.Float64, [80.0, 90.0]),
        "number_habitable_rooms": (pl.Int16, [3, 4]),
        "latest_price": (pl.Int64, [500_000, 600_000]),
        "epc_address": (pl.Utf8, [None, "1 Example Road"]),
        "current_energy_rating": (pl.Utf8, [None, "C"]),
        "potential_energy_rating": (pl.Utf8, [None, "B"]),
        "floor_height": (pl.Float64, [None, 2.4]),
        "construction_age_band": (pl.UInt16, [None, 1930]),
        "is_construction_date_approximate": (pl.UInt8, [None, 1]),
        "was_council_house": (pl.Utf8, [None, "No"]),
    }
    wide = pl.DataFrame(
        {name: values for name, (_dtype, values) in property_columns.items()},
        schema={name: dtype for name, (dtype, _values) in property_columns.items()},
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    example_road = integrated.filter(pl.col("pp_address") == "1 Example Road")
    other_road = integrated.filter(pl.col("pp_address") == "9 Other Road")
    assert example_road["_actual_listing_url"].to_list() == [
        "https://example.test/abc"
    ]
    assert other_road["_actual_listing_url"].to_list() == [None]
def test_integrate_listings_matches_by_uprn_over_address(tmp_path) -> None:
    """A shared UPRN matches a listing even when the addresses disagree.

    The listing's address deliberately does not match the property's, but the
    shared UPRN drives an exact match anyway (UPRN beats fuzzy street).
    """
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    mismatched_address_listing = _sample_listings_frame().with_columns(
        pl.lit("Totally Different Road").alias("Address per Property Register"),
        pl.lit("100000000009").alias("UPRN"),
    )
    mismatched_address_listing.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # Column name -> (dtype, values) for the single property row.
    property_columns = {
        "postcode": (pl.Utf8, ["SW1A 1AA"]),
        "pp_address": (pl.Utf8, ["1 Example Road"]),
        "uprn": (pl.Utf8, ["100000000009"]),
        "pp_property_type": (pl.Utf8, ["Terraced"]),
        "duration": (pl.Utf8, ["Freehold"]),
        "total_floor_area": (pl.Float64, [90.0]),
        "number_habitable_rooms": (pl.Int16, [4]),
        "latest_price": (pl.Int64, [600_000]),
        "epc_address": (pl.Utf8, ["1 Example Road"]),
        "current_energy_rating": (pl.Utf8, ["C"]),
        "potential_energy_rating": (pl.Utf8, ["B"]),
        "floor_height": (pl.Float64, [2.4]),
        "construction_age_band": (pl.UInt16, [1930]),
        "is_construction_date_approximate": (pl.UInt8, [1]),
        "was_council_house": (pl.Utf8, ["No"]),
    }
    wide = pl.DataFrame(
        {name: values for name, (_dtype, values) in property_columns.items()},
        schema={name: dtype for name, (dtype, _values) in property_columns.items()},
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    uprn_matched = integrated.filter(pl.col("pp_address") == "1 Example Road")
    # The listing overlay attached to the UPRN-matched property row.
    assert uprn_matched["_actual_listing_url"].to_list() == [
        "https://example.test/abc"
    ]
    # No spurious seed row for the listing's (non-matching) address.
    assert "Totally Different Road" not in integrated["pp_address"].to_list()
def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> None:
    """A listing on a different street becomes its own seed row.

    A number-less listing whose street is not the property's street (and which
    shares no UPRN) must not be force-matched onto it; it becomes its own seed
    row instead of stamping the wrong property's overlay.
    """
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    other_street_listing = _sample_listings_frame().with_columns(
        pl.lit("Juniper Crescent").alias("Address per Property Register"),
    )
    other_street_listing.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # Column name -> (dtype, values) for the single existing property row.
    property_columns = {
        "postcode": (pl.Utf8, ["SW1A 1AA"]),
        "pp_address": (pl.Utf8, ["Old Cottage High Street"]),
        "pp_property_type": (pl.Utf8, ["Terraced"]),
        "duration": (pl.Utf8, ["Freehold"]),
        "total_floor_area": (pl.Float64, [120.0]),
        "number_habitable_rooms": (pl.Int16, [4]),
        "latest_price": (pl.Int64, [750_000]),
        "epc_address": (pl.Utf8, ["Old Cottage High Street"]),
        "current_energy_rating": (pl.Utf8, ["C"]),
        "potential_energy_rating": (pl.Utf8, ["B"]),
        "floor_height": (pl.Float64, [2.4]),
        "construction_age_band": (pl.UInt16, [1930]),
        "is_construction_date_approximate": (pl.UInt8, [1]),
        "was_council_house": (pl.Utf8, ["No"]),
    }
    wide = pl.DataFrame(
        {name: values for name, (_dtype, values) in property_columns.items()},
        schema={name: dtype for name, (dtype, _values) in property_columns.items()},
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    existing_property = integrated.filter(
        pl.col("pp_address") == "Old Cottage High Street"
    )
    seeded_listing = integrated.filter(pl.col("pp_address") == "Juniper Crescent")
    assert existing_property["_actual_listing_url"].to_list() == [None]
    assert seeded_listing["_actual_listing_url"].to_list() == [
        "https://example.test/abc"
    ]
def test_best_listing_match_rejects_numberless_listing_against_numbered_property() -> (
    None
):
    """A number-less listing must NOT match a numbered property.

    Regression: the number gate is unconditional (like fuzzy_join), and the
    score is token_sort_ratio only, so a single locality token can no longer
    subset-inflate to 100 against a long numbered address.
    """
    numbered_property = {"pp_address": "FLAT A3 CHESHAM HEIGHTS ST MONICAS ROAD"}

    outcome = _best_listing_match(
        listing_uprn=None,
        query="KINGSWOOD",
        uprn_index={},
        bucket_candidates=[numbered_property],
        addressed_fields=["pp_address"],
    )

    assert outcome is None
def test_best_listing_match_allows_numberless_to_numberless_named_house() -> None:
    """A number-less listing can match a number-less (named-house) property.

    The match is allowed when the street/name matches almost exactly; here the
    query and the candidate address are identical.
    """
    candidates = [{"pp_address": "WOODLANDS HOUSE OAK LANE"}]
    result = _best_listing_match(
        listing_uprn=None,
        query="WOODLANDS HOUSE OAK LANE",
        uprn_index={},
        bucket_candidates=candidates,
        addressed_fields=["pp_address"],
    )
    assert result is not None
    # Only the score and method are checked; the other tuple members are
    # intentionally unused, hence the underscore prefixes.
    _candidate, score, method, _field = result
    assert method == "address"
    assert score >= 90.0
def test_best_listing_match_still_matches_numbered_listing_to_numbered_property() -> (
    None
):
    """Numbered listings still match a property with a compatible number.

    No regression for numbered listings: the number gate still permits a
    compatible house number and the lower with-numbers threshold applies.
    """
    address = "10 OAK LANE"

    outcome = _best_listing_match(
        listing_uprn=None,
        query=address,
        uprn_index={},
        bucket_candidates=[{"pp_address": address}],
        addressed_fields=["pp_address"],
    )

    assert outcome is not None
    _matched, match_score, match_method, _matched_field = outcome
    assert match_method == "address"
    assert match_score >= 82.0
def test_best_listing_match_numbered_listing_with_trailing_locality_still_matches() -> (
    None
):
    """Trailing town/county tokens must not drop a correct numbered match.

    A scraped numbered listing often appends town/county tokens that the bare
    Price-Paid register address omits. token_sort alone would score this ~73
    (below 82) and drop a correct match; token_set (allowed for numbered
    queries, where the number gate makes it safe) recovers it.
    """
    register_address = {"pp_address": "105 RIDGEWAY DRIVE"}

    outcome = _best_listing_match(
        listing_uprn=None,
        query="105 RIDGEWAY DRIVE BROMLEY KENT",
        uprn_index={},
        bucket_candidates=[register_address],
        addressed_fields=["pp_address"],
    )

    assert outcome is not None
    matched, match_score, _match_method, _matched_field = outcome
    assert matched["pp_address"] == "105 RIDGEWAY DRIVE"
    assert match_score >= 82.0
def test_best_listing_match_numbered_query_cannot_subset_inflate_across_numbers() -> (
    None
):
    """Incompatible house numbers never match, whatever token_set would score.

    token_set for numbered queries is safe only because the number gate runs
    first: a query and candidate with incompatible house numbers never reach
    scoring, so token_set cannot inflate "10 OAK LANE" onto "12 OAK LANE".
    """
    neighbouring_number = {"pp_address": "12 OAK LANE KINGSTON"}

    outcome = _best_listing_match(
        listing_uprn=None,
        query="10 OAK LANE",
        uprn_index={},
        bucket_candidates=[neighbouring_number],
        addressed_fields=["pp_address"],
    )

    assert outcome is None
def test_best_listing_match_letter_suffix_flats_do_not_cross_match() -> None:
    """"8A" and "8B" are different numbers and must not match each other.

    Regression: the gate uses fuzzy_join's suffix-aware tokens. Under the old
    digit-only tokens both looked like {8} and token_sort scored ~93,
    attaching the wrong flat's record whenever the true candidate was absent
    from the bucket.
    """
    sibling_flat = {"pp_address": "8B HIGH STREET"}

    outcome = _best_listing_match(
        listing_uprn=None,
        query="8A HIGH STREET",
        uprn_index={},
        bucket_candidates=[sibling_flat],
        addressed_fields=["pp_address"],
    )

    assert outcome is None
def test_best_listing_match_building_listing_cannot_absorb_single_flat() -> None:
    """A whole-building listing must not match a single flat inside it.

    Regression: number tokens are compared by set equality (not subset), so
    "188 GREAT NORTH WAY" no longer matches "FLAT 1 188 GREAT NORTH WAY"
    (token_set would have scored the pair 100).
    """
    single_flat = {"pp_address": "FLAT 1 188 GREAT NORTH WAY"}

    outcome = _best_listing_match(
        listing_uprn=None,
        query="188 GREAT NORTH WAY",
        uprn_index={},
        bucket_candidates=[single_flat],
        addressed_fields=["pp_address"],
    )

    assert outcome is None
def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
    None
):
    """Overlay (`_actual_*`) values are promoted into the public listing columns."""
    # Column name -> (dtype, values). Row 0 is a matched listing (it carries
    # historical property data); row 1 is an unmatched listing.
    columns = {
        "Postcode": (pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        "Address per Property Register": (
            pl.Utf8,
            ["1 Example Road", "2 Example Road"],
        ),
        "Address per EPC": (pl.Utf8, ["1 Example Road", None]),
        "Date of last transaction": (pl.Float64, [1990.0, None]),
        "lat": (pl.Float64, [51.5, 51.5]),
        "lon": (pl.Float64, [-0.1, -0.1]),
        "Total floor area (sqm)": (pl.Float64, [100.0, 95.0]),
        "Number of bedrooms & living rooms": (pl.Int16, [3, None]),
        "Property type": (pl.Utf8, ["Terraced", None]),
        "Leasehold/Freehold": (pl.Utf8, ["Leasehold", None]),
        "Last known price": (pl.Int64, [500_000, None]),
        "Tree canopy density percentile": (pl.Float32, [42.0, 42.0]),
        # Overlay columns supplied by the listing itself.
        "_actual_listing_url": (pl.Utf8, ["url0", "url1"]),
        "_actual_asking_price": (pl.Int64, [600_000, 700_000]),
        "_actual_asking_price_per_sqm": (pl.Int32, [5_000, None]),
        "_actual_listing_date": (pl.Datetime("us"), [None, None]),
        "_actual_listing_status": (pl.Utf8, ["For sale", "For sale"]),
        "_actual_listing_features": (pl.List(pl.Utf8), [["Garden"], ["Parking"]]),
        "_actual_bedrooms": (pl.Int32, [3, 4]),
        "_actual_bathrooms": (pl.Int32, [1, 2]),
        "_actual_price_qualifier": (pl.Utf8, ["", ""]),
        "_actual_property_sub_type": (pl.Utf8, ["Mid-Terrace", "End-Terrace"]),
        "_actual_lat": (pl.Float64, [51.51, 51.52]),
        "_actual_lon": (pl.Float64, [-0.11, -0.12]),
        "_actual_total_floor_area": (pl.Float64, [110.0, None]),
        "_actual_number_habitable_rooms": (pl.Int16, [4, 3]),
        "_actual_property_type": (pl.Utf8, ["Terraced", "Flats/Maisonettes"]),
        "_actual_leasehold_freehold": (pl.Utf8, ["Freehold", "Leasehold"]),
    }
    df = pl.DataFrame(
        {name: values for name, (_dtype, values) in columns.items()},
        schema={name: dtype for name, (dtype, _values) in columns.items()},
    )

    finalized = _finalize_listings(df).sort("Address per Property Register")

    assert finalized.height == 2
    expected_by_column = {
        "Listing URL": ["url0", "url1"],
        "Asking price": [600_000, 700_000],
        "Asking price per sqm": [5_000, 7_368],
        "Est. price per sqm": [5_000, 7_368],
        "Estimated current price": [600_000, 700_000],
        "Last known price": [500_000, 700_000],
        # Listing's preferred floor area / property type / tenure.
        "Total floor area (sqm)": [110.0, 95.0],
        # Rooms prefer the EPC habitable-room count over the listing's
        # beds+baths value: row 0 keeps the EPC 3 (not the listing's _actual
        # 4); row 1 has no EPC count so it falls back to the listing's 3.
        "Number of bedrooms & living rooms": [3, 3],
        "Property type": ["Terraced", "Flats/Maisonettes"],
        "Leasehold/Freehold": ["Freehold", "Leasehold"],
        # Postcode-level feature carried through to matched and unmatched rows.
        "Tree canopy density percentile": [42.0, 42.0],
        # Match status reflects historical context availability.
        "Historical property match status": ["matched", "unmatched"],
    }
    for column, expected in expected_by_column.items():
        assert finalized[column].to_list() == expected, column

    # Overlay scaffolding is dropped.
    for source_name, overlay_name, _dtype in _LISTING_OVERLAY_SOURCES:
        assert overlay_name not in finalized.columns, source_name
def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
    """Finalize emits one row per listing URL, not one per collapsed wide row.

    The terminated-postcode remap can collapse two distinct wide rows onto the
    same (postcode, pp_address), so a single matched listing attaches to both.
    """
    # Column name -> (dtype, values) for the two collapsed wide rows.
    columns = {
        "Postcode": (pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        "Address per Property Register": (
            pl.Utf8,
            ["1 Example Road", "1 Example Road"],
        ),
        "Address per EPC": (pl.Utf8, ["1 Example Road", "1 Example Road"]),
        "Date of last transaction": (pl.Float64, [1990.0, 1995.0]),
        "lat": (pl.Float64, [51.5, 51.5]),
        "lon": (pl.Float64, [-0.1, -0.1]),
        "Total floor area (sqm)": (pl.Float64, [100.0, 95.0]),
        "Number of bedrooms & living rooms": (pl.Int16, [3, 3]),
        "Property type": (pl.Utf8, ["Terraced", "Terraced"]),
        "Leasehold/Freehold": (pl.Utf8, ["Leasehold", "Leasehold"]),
        "Last known price": (pl.Int64, [500_000, 480_000]),
        "Tree canopy density percentile": (pl.Float32, [42.0, 42.0]),
        # Same listing URL on both collapsed rows -- the fan-out to fix.
        "_actual_listing_url": (pl.Utf8, ["url0", "url0"]),
        "_actual_asking_price": (pl.Int64, [600_000, 600_000]),
        "_actual_asking_price_per_sqm": (pl.Int32, [5_000, 5_000]),
        "_actual_listing_date": (pl.Datetime("us"), [None, None]),
        "_actual_listing_status": (pl.Utf8, ["For sale", "For sale"]),
        "_actual_listing_features": (pl.List(pl.Utf8), [["Garden"], ["Garden"]]),
        "_actual_bedrooms": (pl.Int32, [3, 3]),
        "_actual_bathrooms": (pl.Int32, [1, 1]),
        "_actual_price_qualifier": (pl.Utf8, ["", ""]),
        "_actual_property_sub_type": (pl.Utf8, ["Mid-Terrace", "Mid-Terrace"]),
        "_actual_lat": (pl.Float64, [51.51, 51.51]),
        "_actual_lon": (pl.Float64, [-0.11, -0.11]),
        "_actual_total_floor_area": (pl.Float64, [110.0, 110.0]),
        "_actual_number_habitable_rooms": (pl.Int16, [4, 4]),
        "_actual_property_type": (pl.Utf8, ["Terraced", "Terraced"]),
        "_actual_leasehold_freehold": (pl.Utf8, ["Freehold", "Freehold"]),
    }
    df = pl.DataFrame(
        {name: values for name, (_dtype, values) in columns.items()},
        schema={name: dtype for name, (dtype, _values) in columns.items()},
    )

    finalized = _finalize_listings(df)

    assert finalized.height == 1
    assert finalized["Listing URL"].to_list() == ["url0"]