"""Tests for helpers in ``pipeline.transform.merge``."""

import polars as pl
import pyarrow as pa
import pytest
from shapely import box, to_wkb
from shapely.geometry import Point

from pipeline.transform.merge import (
    _AREA_COLUMNS,
    CONSERVATION_AREA_FEATURE,
    LISTED_BUILDING_FEATURE,
    TREE_DENSITY_FEATURE,
    _LISTING_OVERLAY_SOURCES,
    _active_english_postcode_area,
    _build_unmatched_listing_seed_rows,
    _canonical_postcode_expr,
    _best_listing_match,
    _coalesce_direct_epc_columns,
    _dedupe_collapsed_properties,
    _filter_to_active_english_postcodes,
    _join_area_side_tables,
    _finalize_listings,
    _integrate_listings,
    _match_direct_epc,
    _match_listing_properties,
    _normalize_uprn,
    _is_dynamic_poi_metric_column,
    _less_deprived_percentile_expr,
    _load_conservation_area_geometries,
    _load_listings_for_merge,
    _matched_listed_building_flags,
    _postcode_conservation_area_flags,
    _postcode_listed_building_candidates,
    _remap_terminated_postcodes,
    _split_normal_outputs,
    _tree_density_by_postcode,
    _validate_lad_source_coverage,
    _validate_lsoa_source_coverage,
    _validate_postcode_feature_output,
    _validate_property_postcodes,
)


def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    df = pl.DataFrame({"Income Score (rate)": [1.0, 2.0, 3.0, None]})

    result = (
        df.lazy()
        .with_columns(_less_deprived_percentile_expr("Income Score (rate)"))
        .collect()
    )

    assert result["Income Score (rate)"].to_list() == [100.0, 50.0, 0.0, None]


def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    df = pl.DataFrame({"Income Score (rate)": [1.0, 1.0, 2.0, 3.0, 3.0]})

    result = (
        df.lazy()
        .with_columns(_less_deprived_percentile_expr("Income Score (rate)"))
        .collect()
    )

    assert result["Income Score (rate)"].to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]


def test_dynamic_poi_metric_columns_are_area_level() -> None:
    assert _is_dynamic_poi_metric_column("Distance to nearest amenity (Cafe) (km)")
    assert _is_dynamic_poi_metric_column("Distance to nearest amenity (Park) (km)")
    assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 2km")
    assert _is_dynamic_poi_metric_column("Number of amenities (Cafe) within 5km")
    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")


def test_country_code_is_kept_in_postcode_area_columns() -> None:
    assert "ctry25cd" in _AREA_COLUMNS


def test_conservation_area_feature_is_area_level() -> None:
    assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS


def test_tree_density_is_area_level_and_survives_the_split() -> None:
    # Street tree density is a postcode-centroid percentile (constant per
    # postcode), so it must route to the postcode/area output -- not be stripped
    # by _area_columns_from -- and must NOT be duplicated into the property
    # output. Regression for the drift where it landed only in properties.parquet
    # and was lost for the ~308k property-less postcodes.
    assert TREE_DENSITY_FEATURE in _AREA_COLUMNS

    df = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Last known price": [250_000],
            TREE_DENSITY_FEATURE: [42.0],
        }
    )
    postcode_features = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            TREE_DENSITY_FEATURE: [42.0, 7.0],
        }
    )

    postcode_df, properties_df = _split_normal_outputs(
        df, postcode_features, expected_postcode_count=2
    )

    assert TREE_DENSITY_FEATURE in postcode_df.columns
    assert postcode_df[TREE_DENSITY_FEATURE].to_list() == [42.0, 7.0]
    assert TREE_DENSITY_FEATURE not in properties_df.columns


def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
    # Crime is now a raw spatial count per postcode; the per-1k-residents
    # variants were dropped along with the LSOA population denominator.
    assert "Serious crime (avg/yr)" in _AREA_COLUMNS
    assert "Minor crime (avg/yr)" in _AREA_COLUMNS
    assert "Serious crime per 1k residents (avg/yr)" not in _AREA_COLUMNS
    assert "Minor crime per 1k residents (avg/yr)" not in _AREA_COLUMNS


def test_active_english_postcode_area_filters_to_active_england() -> None:
    arcgis = pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
            "ctry25cd": ["E92000001", "E92000001", "W92000004"],
            "doterm": [None, "2020-01-01", None],
            "lat": [51.0, 51.1, 52.0],
            "long": [-0.1, -0.2, -3.0],
            "lsoa21cd": ["L1", "L2", "L3"],
            "oa21cd": ["O1", "O2", "O3"],
            "pcon24cd": ["P1", "P2", "P3"],
        }
    )

    result = _active_english_postcode_area(arcgis.lazy()).collect()

    assert result.to_dicts() == [
        {
            "postcode": "AA1 1AA",
            "lat": 51.0,
            "lon": -0.1,
            "ctry25cd": "E92000001",
            "lsoa21": "L1",
            "oa21": "O1",
            "pcon": "P1",
        }
    ]


def test_remap_then_active_filter_keeps_terminated_english_properties() -> None:
    wide = pl.DataFrame(
        {
            "postcode": ["OLD 1AA", "NEW 1AA", "CF1 1AA"],
            "row_id": [1, 2, 3],
        }
    ).lazy()
    mapping = pl.DataFrame(
        {"old_postcode": ["OLD 1AA"], "new_postcode": ["NEW 1AA"]}
    ).lazy()
    active_postcodes = pl.DataFrame({"postcode": ["NEW 1AA"]}).lazy()

    result = (
        _filter_to_active_english_postcodes(
            _remap_terminated_postcodes(wide, mapping), active_postcodes
        )
        .collect()
        .sort("row_id")
    )

    assert result.to_dicts() == [
        {"postcode": "NEW 1AA", "row_id": 1},
        {"postcode": "NEW 1AA", "row_id": 2},
    ]


def test_split_normal_outputs_uses_postcode_feature_universe() -> None:
    df = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Address per Property Register": ["1 Example Road"],
            "Last known price": [250_000],
            "lat": [51.0],
            "lon": [-0.1],
            "ctry25cd": ["E92000001"],
            "lsoa21": ["L1"],
        }
    )
    postcode_features = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            "lsoa21": ["L1", "L2"],
            "Distance to nearest amenity (Park) (km)": [0.3, 0.8],
        }
    )

    postcode_df, properties_df = _split_normal_outputs(
        df, postcode_features, expected_postcode_count=2
    )

    assert postcode_df["Postcode"].to_list() == ["AA1 1AA", "BB1 1BB"]
    assert "Distance to nearest amenity (Park) (km)" in postcode_df.columns
    assert properties_df.to_dicts() == [
        {
            "Postcode": "AA1 1AA",
            "Address per Property Register": "1 Example Road",
            "Last known price": 250_000,
        }
    ]


def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() -> None:
    postcode_df = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "CF1 1AA"],
            "lat": [51.0, None],
            "lon": [-0.1, None],
            "ctry25cd": ["E92000001", "W92000004"],
        }
    )

    with pytest.raises(ValueError, match="unsupported or ungeocoded"):
        _validate_postcode_feature_output(postcode_df, expected_postcode_count=2)


def test_postcode_feature_validation_rejects_wrong_count() -> None:
    # The universe-size invariant: the postcode feature output must contain
    # EXACTLY the active-England universe. Too few rows (silently dropped
    # postcodes) and too many / duplicated rows (a join fan-out) must both fail,
    # so neither a truncated build nor a one-to-many join can ship.
    too_few = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "lat": [51.0],
            "lon": [-0.1],
            "ctry25cd": ["E92000001"],
        }
    )
    with pytest.raises(ValueError, match="active England postcode universe"):
        _validate_postcode_feature_output(too_few, expected_postcode_count=2)

    too_many = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
            "lat": [51.0, 52.0, 53.0],
            "lon": [-0.1, -0.2, -0.3],
            "ctry25cd": ["E92000001"] * 3,
        }
    )
    with pytest.raises(ValueError, match="active England postcode universe"):
        _validate_postcode_feature_output(too_many, expected_postcode_count=2)

    # Right row count but a duplicated key (n_unique < height) -- the signature of
    # a join fan-out.
    duplicated = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "AA1 1AA"],
            "lat": [51.0, 51.0],
            "lon": [-0.1, -0.1],
            "ctry25cd": ["E92000001", "E92000001"],
        }
    )
    with pytest.raises(ValueError, match="active England postcode universe"):
        _validate_postcode_feature_output(duplicated, expected_postcode_count=2)


def test_join_area_side_tables_does_not_fan_out_on_unique_keys() -> None:
    # Soundness: with side tables unique on their join key, the per-postcode
    # feature joins emit exactly one row per postcode (no fan-out). A fan-out here
    # would inflate the postcode universe above the active-England count -- the
    # failure the universe assertion above is the backstop for.
    base = pl.LazyFrame(
        {
            "postcode": ["AA1 1AA", "BB2 2BB"],
            "lsoa21": ["E01000001", "E01000002"],
            "Local Authority District code (2024)": ["E09000001", "E09000002"],
            "pcon": ["E14000001", "E14000002"],
        }
    )

    def _by_postcode(extra: dict) -> pl.LazyFrame:
        return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})

    crime = pl.LazyFrame(
        {
            "postcode": ["AA1 1AA", "BB2 2BB"],
            "Serious crime (avg/yr)": [1.0, 2.0],
            "Minor crime (avg/yr)": [3.0, 4.0],
        }
    )

    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
        ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
        poi_counts=_by_postcode({}),
        noise=_by_postcode({}),
        school_catchments=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=pl.LazyFrame(
            {
                "bb_postcode": ["AA1 1AA", "BB2 2BB"],
                "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16),
            }
        ),
    ).collect()

    # One row per postcode in -> one row out; the universe is not inflated.
    assert joined.height == 2
    assert sorted(joined["postcode"].to_list()) == ["AA1 1AA", "BB2 2BB"]


def test_join_area_side_tables_normalizes_broadband_postcode_key() -> None:
    # Broadband comes straight from Ofcom's CSV, so its postcode can drift in
    # spacing/casing from the NSPL `pcds` base key. Both sides must be reduced
    # to the same canonical form so a real postcode populates
    # `max_download_speed` instead of silently missing the left join.
    base = pl.LazyFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            "lsoa21": ["E01000001", "E01000002"],
            "Local Authority District code (2024)": ["E09000001", "E09000002"],
            "pcon": ["E14000001", "E14000002"],
        }
    )

    def _by_postcode(extra: dict) -> pl.LazyFrame:
        return pl.LazyFrame({"postcode": ["AB1 2CD", "EF3 4GH"], **extra})

    crime = pl.LazyFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            "Serious crime (avg/yr)": [1.0, 2.0],
            "Minor crime (avg/yr)": [3.0, 4.0],
        }
    )
    # AB1 2CD arrives lowercase; EF3 4GH arrives under two distinct raw
    # spellings (one of them lowercase + un-spaced) that canonicalize to one
    # key (the max speed must win, with no fan-out of the base row).
    broadband = pl.LazyFrame(
        {
            "bb_postcode": ["ab1 2cd", "ef34gh", "EF3 4GH"],
            "max_download_speed": pl.Series([300, 30, 1000], dtype=pl.UInt16),
        }
    )

    joined = _join_area_side_tables(
        base,
        iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}),
        ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        crime=crime,
        median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}),
        election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}),
        poi_counts=_by_postcode({}),
        noise=_by_postcode({}),
        school_catchments=_by_postcode({}),
        conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}),
        tree_density=None,
        broadband=broadband,
    ).collect()

    # No fan-out: still one row per base postcode.
    assert joined.height == 2
    speeds = dict(
        zip(joined["postcode"].to_list(), joined["max_download_speed"].to_list())
    )
    # Spacing/casing drift still joins.
    assert speeds["AB1 2CD"] == 300
    # Two raw spellings collapse to one canonical key; the max wins.
    assert speeds["EF3 4GH"] == 1000
    # The temporary canonical join key is not leaked into the output schema.
    assert "_base_canonical_postcode" not in joined.columns
    assert "_bb_canonical_postcode" not in joined.columns
    assert "bb_postcode" not in joined.columns


def test_listed_building_feature_is_property_level() -> None:
    assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS


def test_postcode_conservation_area_flags_marks_point_membership() -> None:
    postcodes = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
            "lat": [0.5, 2.0, None],
            "lon": [0.5, 2.0, 0.5],
        }
    )

    result = _postcode_conservation_area_flags(
        postcodes, [box(0, 0, 1, 1)], "EPSG:4326", batch_size=2
    ).sort("postcode")

    assert result.to_dicts() == [
        {"postcode": "AA1 1AA", CONSERVATION_AREA_FEATURE: "Yes"},
        {"postcode": "BB1 1BB", CONSERVATION_AREA_FEATURE: "No"},
        {"postcode": "CC1 1CC", CONSERVATION_AREA_FEATURE: "No"},
    ]


def test_load_conservation_area_geometries_uses_current_planning_data_records(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path,
) -> None:
    real_area = box(0, 0, 1, 1)
    ended_area = box(2, 2, 3, 3)
    other_dataset_area = box(4, 4, 5, 5)
    point = Point(0.5, 0.5)

    def fake_read_arrow(path):
        assert path == tmp_path / "conservation_areas.geojson"
        table = pa.table(
            {
                "dataset": [
                    "conservation-area",
                    "conservation-area",
                    "listed-building",
                    "conservation-area",
                ],
                "end-date": ["", "2025-01-01", "", ""],
                "name": ["Central Village", "Old Boundary", "Other", "Point Record"],
                "SHAPE": to_wkb([real_area, ended_area, other_dataset_area, point]),
            }
        )
        return {"geometry_name": "SHAPE", "crs": "EPSG:4326"}, table

    monkeypatch.setattr("pipeline.transform.merge.pyogrio.read_arrow", fake_read_arrow)

    geometries, crs = _load_conservation_area_geometries(
        tmp_path / "conservation_areas.geojson"
    )

    assert crs == "EPSG:4326"
    assert geometries == [real_area]


def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
    listed_points = pl.DataFrame(
        {
            "ListEntry": [1234, 5678],
            "Name": ["1 and 2 High Street", "Distant Hall"],
            "Grade": ["II", "I"],
            "Easting": [100.0, 1000.0],
            "Northing": [100.0, 1000.0],
        }
    ).with_columns(
        pl.col("Name")
        .str.to_uppercase()
        .str.replace_all(r"[^0-9A-Z]+", " ")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
        .alias("_listed_match_name")
    )
    active_postcodes = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "east1m": [105.0, 5000.0],
            "north1m": [105.0, 5000.0],
        }
    )

    result = _postcode_listed_building_candidates(
        listed_points,
        active_postcodes,
        nearest_postcodes=1,
        max_distance_m=25,
    )

    assert result.select("postcode", "_listed_match_name").to_dicts() == [
        {"postcode": "AA1 1AA", "_listed_match_name": "1 AND 2 HIGH STREET"}
    ]


def test_matched_listed_building_flags_requires_address_match() -> None:
    properties = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AA", "BB1 1BB"],
            "pp_address": ["1 HIGH STREET", "99 HIGH STREET", "THE OLD RECTORY"],
            "epc_address": ["1, High Street", "99, High Street", "Old Rectory"],
        }
    )
    listed_candidates = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "_listed_match_name": ["1 AND 2 HIGH STREET", "OLD RECTORY"],
            "_listed_grade": ["II", "II*"],
            "_listed_entry": [1234, 5678],
        }
    )

    result = _matched_listed_building_flags(
        properties.lazy(), listed_candidates, min_score=95
    ).sort("postcode", "pp_address")

    assert result.to_dicts() == [
        {
            "postcode": "AA1 1AA",
            "pp_address": "1 HIGH STREET",
            LISTED_BUILDING_FEATURE: "Yes",
        },
        {
            "postcode": "BB1 1BB",
            "pp_address": "THE OLD RECTORY",
            LISTED_BUILDING_FEATURE: "Yes",
        },
    ]


def test_validate_property_postcodes_rejects_blank_rows() -> None:
    df = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", ""],
            "Address per Property Register": ["1 Example Street", "2 Example Street"],
            "Last known price": [100_000, 200_000],
        }
    )

    with pytest.raises(ValueError, match="Property rows missing a postcode"):
        _validate_property_postcodes(df)


def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
) -> None:
    iod_path = tmp_path / "iod.parquet"
    rental_path = tmp_path / "rental.parquet"
    pl.DataFrame(
        {
            "Local Authority District code (2024)": [
                "E08000016",
                "E06000053",
                "E09000001",
            ],
            "Local Authority District name (2024)": [
                "Barnsley",
                "Isles of Scilly",
                "City of London",
            ],
        }
    ).write_parquet(iod_path)
    pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]}).write_parquet(
        rental_path
    )

    _validate_lad_source_coverage(iod_path, rental_path)


def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    iod_path = tmp_path / "iod.parquet"
    rental_path = tmp_path / "rental.parquet"
    pl.DataFrame(
        {
            "Local Authority District code (2024)": ["E08000016"],
            "Local Authority District name (2024)": ["Barnsley"],
        }
    ).write_parquet(iod_path)
    pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]}).write_parquet(
        rental_path
    )

    with pytest.raises(ValueError, match="Rental data is missing"):
        _validate_lad_source_coverage(iod_path, rental_path)


def test_validate_lsoa_source_coverage_allows_full_ethnicity_coverage(
    tmp_path,
) -> None:
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}).write_parquet(
        iod_path
    )
    # Ethnicity may carry extra LSOAs (e.g. property-less ones); only the IoD
    # LSOAs are required to all be present.
    pl.DataFrame(
        {"lsoa21": ["E01000001", "E01000002", "E01000003"]}
    ).write_parquet(ethnicity_path)

    _validate_lsoa_source_coverage(iod_path, ethnicity_path)


def test_validate_lsoa_source_coverage_rejects_missing_lsoa(tmp_path) -> None:
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    pl.DataFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}).write_parquet(
        iod_path
    )
    pl.DataFrame({"lsoa21": ["E01000001"]}).write_parquet(ethnicity_path)

    with pytest.raises(ValueError, match="Ethnicity data is missing LSOA coverage"):
        _validate_lsoa_source_coverage(iod_path, ethnicity_path)


def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
    path = tmp_path / "tree_density_by_postcode.parquet"
    pl.DataFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            "Tree canopy density percentile within 50m": [12.5, 99.0],
        }
    ).write_parquet(path)

    result = _tree_density_by_postcode(path).collect().sort("postcode")

    assert result.columns == ["postcode", TREE_DENSITY_FEATURE]
    assert result[TREE_DENSITY_FEATURE].to_list() == [12.5, 99.0]
    assert result.schema[TREE_DENSITY_FEATURE] == pl.Float32


def test_tree_density_by_postcode_requires_postcode_and_density_columns(
    tmp_path,
) -> None:
    path = tmp_path / "tree_density_by_postcode.parquet"
    pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]}).write_parquet(path)

    with pytest.raises(ValueError, match="must contain column"):
        _tree_density_by_postcode(path)

    missing_postcode_path = tmp_path / "missing_postcode.parquet"
    pl.DataFrame({"Tree canopy density percentile within 50m": [12.5]}).write_parquet(
        missing_postcode_path
    )

    with pytest.raises(ValueError, match="missing required column: postcode"):
        _tree_density_by_postcode(missing_postcode_path)


def _sample_listings_frame() -> pl.DataFrame:
    """Return a one-row listings frame with an explicit schema.

    The postcode is deliberately stored compact and lowercase (``sw1a1aa``) so
    tests can check that loading canonicalises it.
    """
    return pl.DataFrame(
        {
            "Bedrooms": [3],
            "Bathrooms": [2],
            "Number of bedrooms & living rooms": [4],
            "lon": [-0.1],
            "lat": [51.5],
            "Postcode": ["sw1a1aa"],
            "Address per Property Register": ["1 Example Road"],
            "Leasehold/Freehold": ["Freehold"],
            "Property type": ["Terraced"],
            "Property sub-type": ["Mid-Terrace"],
            "Price qualifier": [""],
            "Total floor area (sqm)": [120.0],
            "Listing URL": ["https://example.test/abc"],
            "Listing features": [["Garden", "Off-street parking"]],
            "Listing date": [None],
            "Listing status": ["For sale"],
            "Asking price": [750_000],
            "Asking price per sqm": [6_250],
        },
        schema={
            "Bedrooms": pl.Int32,
            "Bathrooms": pl.Int32,
            "Number of bedrooms & living rooms": pl.Int32,
            "lon": pl.Float64,
            "lat": pl.Float64,
            "Postcode": pl.Utf8,
            "Address per Property Register": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Property type": pl.Utf8,
            "Property sub-type": pl.Utf8,
            "Price qualifier": pl.Utf8,
            "Total floor area (sqm)": pl.Float64,
            "Listing URL": pl.Utf8,
            "Listing features": pl.List(pl.Utf8),
            "Listing date": pl.Datetime("us"),
            "Listing status": pl.Utf8,
            "Asking price": pl.Int64,
            "Asking price per sqm": pl.Int32,
        },
    )


def _stub_arcgis(path) -> None:
    """Write a one-row postcode lookup parquet (``SW1A 1AA``) to *path*."""
    pl.DataFrame(
        {
            "pcds": ["SW1A 1AA"],
            "ctry25cd": ["E92000001"],
            "doterm": [None],
            "east1m": [530000.0],
            "north1m": [180000.0],
        },
        schema={
            "pcds": pl.Utf8,
            "ctry25cd": pl.Utf8,
            "doterm": pl.Utf8,
            "east1m": pl.Float64,
            "north1m": pl.Float64,
        },
    ).write_parquet(path)


def test_canonical_postcode_expr_formats_compact_postcodes() -> None:
    df = pl.DataFrame({"Postcode": ["sw1a1aa", "SW1A 1AA", "bad", None]})

    result = df.with_columns(_canonical_postcode_expr("Postcode").alias("canonical"))

    assert result["canonical"].to_list() == ["SW1A 1AA", "SW1A 1AA", None, None]


def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
    tmp_path,
) -> None:
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    loaded = _load_listings_for_merge(listings_path, arcgis_path)

    assert loaded["postcode"].to_list() == ["SW1A 1AA"]
    assert loaded["pp_address"].to_list() == ["1 Example Road"]
    assert loaded["_actual_listing_url"].to_list() == ["https://example.test/abc"]
    assert loaded["_actual_asking_price"].to_list() == [750_000]
    assert loaded["_actual_lat"].to_list() == [51.5]


def test_load_listings_for_merge_uprn_key_matches_normalize_uprn(tmp_path) -> None:
    # A Float UPRN (e.g. read from a NaN-bearing parquet column) must produce
    # the same digits-only key as `_normalize_uprn` on the candidate side, so
    # the exact UPRN match is not lost. Naively stringifying "100023336956.0"
    # and stripping non-digits would yield "1000233369560" (a bogus trailing
    # zero) which never collides with the candidate key "100023336956".
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().with_columns(
        pl.lit(100023336956.0, dtype=pl.Float64).alias("UPRN")
    ).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    loaded = _load_listings_for_merge(listings_path, arcgis_path)

    assert loaded["_listing_uprn"].to_list() == [_normalize_uprn(100023336956.0)]
    assert loaded["_listing_uprn"].to_list() == ["100023336956"]


def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
    tmp_path,
) -> None:
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    listings = _load_listings_for_merge(listings_path, arcgis_path)

    template_schema = pl.Schema(
        {
            "postcode": pl.Utf8,
            "pp_address": pl.Utf8,
            "pp_property_type": pl.Utf8,
            "duration": pl.Utf8,
            "total_floor_area": pl.Float64,
            "number_habitable_rooms": pl.Int16,
            "latest_price": pl.Int64,
            "epc_address": pl.Utf8,
            **{dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES},
        }
    )
    unmatched_idxs = listings.select("_listing_idx")

    seed = _build_unmatched_listing_seed_rows(unmatched_idxs, listings, template_schema)

    assert seed.height == 1
    assert seed["postcode"].to_list() == ["SW1A 1AA"]
    assert seed["pp_address"].to_list() == ["1 Example Road"]
    assert seed["pp_property_type"].to_list() == ["Terraced"]
    assert seed["duration"].to_list() == ["Freehold"]
    assert seed["total_floor_area"].to_list() == [120.0]
    assert seed["number_habitable_rooms"].to_list() == [4]
    assert seed["latest_price"].to_list() == [750_000]
    # Columns not populated from the listing default to null.
    assert seed["epc_address"].to_list() == [None]
    # Overlay columns flow through 1:1.
    assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]


def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
    tmp_path,
) -> None:
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().with_columns(
        pl.lit(None, dtype=pl.Float64).alias("Total floor area (sqm)"),
        pl.lit(None, dtype=pl.Int32).alias("Number of bedrooms & living rooms"),
    ).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns(
        pl.lit("1 Example Road").alias("_direct_epc_address"),
        pl.lit("C").alias("_direct_current_energy_rating"),
        pl.lit("B").alias("_direct_potential_energy_rating"),
        pl.lit(98.0).alias("_direct_total_floor_area"),
        pl.lit(4, dtype=pl.Int16).alias("_direct_number_habitable_rooms"),
        pl.lit(2.4).alias("_direct_floor_height"),
        pl.lit(1930, dtype=pl.UInt16).alias("_direct_construction_age_band"),
        pl.lit(1, dtype=pl.UInt8).alias("_direct_is_construction_date_approximate"),
        pl.lit("No").alias("_direct_was_council_house"),
    )
    template_schema = pl.Schema(
        {
            "postcode": pl.Utf8,
            "pp_address": pl.Utf8,
            "total_floor_area": pl.Float64,
            "number_habitable_rooms": pl.Int16,
            "epc_address": pl.Utf8,
            "current_energy_rating": pl.Utf8,
            "was_council_house": pl.Utf8,
            **{dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES},
        }
    )

    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )

    assert seed["total_floor_area"].to_list() == [98.0]
    assert seed["number_habitable_rooms"].to_list() == [4]
    assert seed["epc_address"].to_list() == ["1 Example Road"]
    assert seed["current_energy_rating"].to_list() == ["C"]
    assert seed["was_council_house"].to_list() == ["No"]


def test_build_unmatched_listing_seed_rows_prefers_direct_epc_rooms_over_listing(
    tmp_path,
) -> None:
    # When BOTH the listing room count and a direct-EPC count exist, the EPC
    # value must win: the scraped "Number of bedrooms & living rooms" is actually
    # bedrooms + bathrooms (upstream defect), so preferring it would inflate the
    # count. This pins the coalesce direction (direct-EPC before listing).
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().with_columns(
        # The corrupt listing room count (beds + baths).
        pl.lit(5, dtype=pl.Int32).alias("Number of bedrooms & living rooms"),
    ).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns(
        # The genuine EPC habitable-room count.
        pl.lit(3, dtype=pl.Int16).alias("_direct_number_habitable_rooms"),
    )
    template_schema = pl.Schema(
        {
            "postcode": pl.Utf8,
            "pp_address": pl.Utf8,
            "number_habitable_rooms": pl.Int16,
            **{dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES},
        }
    )

    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )

    assert seed["number_habitable_rooms"].to_list() == [3]


_DIRECT_EPC_CANDIDATE_SCHEMA = {
    "_direct_epc_row": pl.UInt32,
    "_direct_epc_match_address": pl.Utf8,
    "_direct_epc_match_postcode": pl.Utf8,
    "_direct_epc_outcode": pl.Utf8,
    "_direct_epc_canonical_property_type": pl.Utf8,
    "_direct_epc_uprn": pl.Utf8,
    "_direct_epc_address": pl.Utf8,
    "_direct_current_energy_rating": pl.Utf8,
    "_direct_potential_energy_rating": pl.Utf8,
    "_direct_total_floor_area": pl.Float64,
    "_direct_number_habitable_rooms": pl.Int16,
    "_direct_floor_height": pl.Float64,
    "_direct_construction_age_band": pl.UInt16,
    "_direct_is_construction_date_approximate": pl.UInt8,
    "_direct_was_council_house": pl.Utf8,
}

_LISTING_MATCH_SCHEMA = {
    "_listing_idx": pl.UInt32,
    "_listing_match_address": pl.Utf8,
    "_listing_match_postcode": pl.Utf8,
    "_listing_uprn": pl.Utf8,
}


def _direct_epc_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build a direct-EPC candidate frame from per-row overrides.

    Each dict in *rows* is merged over the defaults in ``base``, so a test only
    spells out the fields it cares about.
    """
    base = {
        "_direct_epc_row": 0,
        "_direct_epc_match_address": "1 EXAMPLE ROAD",
        "_direct_epc_match_postcode": "AA11AA",
        "_direct_epc_outcode": "AA1",
        "_direct_epc_canonical_property_type": "Terraced",
        "_direct_epc_uprn": None,
        "_direct_epc_address": "1, Example Road",
        "_direct_current_energy_rating": "C",
        "_direct_potential_energy_rating": "B",
        "_direct_total_floor_area": 101.0,
        "_direct_number_habitable_rooms": 4,
        "_direct_floor_height": 2.5,
        "_direct_construction_age_band": 1930,
        "_direct_is_construction_date_approximate": 1,
        "_direct_was_council_house": "No",
    }
    return pl.DataFrame(
        [{**base, **row} for row in rows], schema=_DIRECT_EPC_CANDIDATE_SCHEMA
    )


def _listing_matches(rows: list[dict]) -> pl.DataFrame:
    """Build a listing match frame; each dict in *rows* overrides ``base``."""
    base = {
        "_listing_idx": 0,
        "_listing_match_address": "1 EXAMPLE ROAD",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
    }
    return pl.DataFrame([{**base, **row} for row in rows], schema=_LISTING_MATCH_SCHEMA)


def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None:
    # UPRN is matched globally (not within a postcode bucket), so a listing
    # whose detail-page postcode is slightly off still resolves to the right
    # EPC certificate by its UPRN.
    matches = _match_direct_epc(
        _listing_matches(
            [{"_listing_uprn": "100000000001", "_listing_match_postcode": "ZZ99ZZ"}]
        ),
        _direct_epc_candidates(
            [
                {
                    "_direct_epc_uprn": "100000000001",
                    "_direct_epc_match_postcode": "AA11AA",
                }
            ]
        ),
    )

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["uprn"]


def test_match_direct_epc_matches_by_address_in_same_postcode() -> None:
    matches = _match_direct_epc(
        _listing_matches([{"_listing_match_address": "1 EXAMPLE ROAD"}]),
        _direct_epc_candidates([{"_direct_epc_match_address": "1 EXAMPLE ROAD"}]),
    )

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["address"]


def test_match_direct_epc_street_fallback_matches_numberless_listing() -> None:
    # A street-level listing address (the Rightmove norm: no house number, no
    # UPRN) cannot pass the strict number gate, but must still pick up
    # street-representative EPC facts from a same-street certificate in its own
    # postcode, labelled with the lower-confidence "street" method.
    matches = _match_direct_epc(
        _listing_matches([{"_listing_match_address": "EXAMPLE ROAD BROMLEY"}]),
        _direct_epc_candidates([{"_direct_epc_match_address": "7 EXAMPLE ROAD"}]),
    )

    assert matches.height == 1
    assert matches["_direct_epc_match_method"].to_list() == ["street"]


def test_match_direct_epc_street_fallback_prefers_attribute_agreement() -> None:
    # Every same-street certificate ties on street similarity, so the listing's
    # attributes (floor area here) must pick the most plausible one.
    listings = pl.DataFrame(
        [
            {
                "_listing_idx": 0,
                "_listing_match_address": "EXAMPLE ROAD BROMLEY",
                "_listing_match_postcode": "AA11AA",
                "_listing_uprn": None,
                "_actual_total_floor_area": 78.0,
            }
        ],
        schema={**_LISTING_MATCH_SCHEMA, "_actual_total_floor_area": pl.Float64},
    )

    matches = _match_direct_epc(
        listings,
        _direct_epc_candidates(
            [
                {
                    "_direct_epc_match_address": "7 EXAMPLE ROAD",
                    "_direct_epc_address": "7, Example Road",
                    "_direct_total_floor_area": 150.0,
                },
                {
                    "_direct_epc_row": 1,
                    "_direct_epc_match_address": "9 EXAMPLE ROAD",
                    "_direct_epc_address": "9, Example Road",
                    "_direct_total_floor_area": 80.0,
                },
            ]
        ),
    )

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["9, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["street"]


def test_match_direct_epc_street_fallback_spans_postcodes_within_outcode() -> None:
    # Long streets cross postcode units. A street-only listing whose own
    # postcode has no certificate must still pick up a same-street certificate
    # from a sibling postcode in the same outcode.
    matches = _match_direct_epc(
        _listing_matches(
            [
                {
                    "_listing_match_address": "EXAMPLE ROAD BROMLEY",
                    "_listing_match_postcode": "AA12ZZ",
                }
            ]
        ),
        _direct_epc_candidates(
            [
                {
                    "_direct_epc_match_address": "7 EXAMPLE ROAD",
                    "_direct_epc_match_postcode": "AA11AA",
                }
            ]
        ),
    )

    assert matches.height == 1
    assert matches["_direct_epc_match_method"].to_list() == ["street"]


def test_match_direct_epc_street_fallback_prefers_own_postcode_segment() -> None:
    # Within one street, the certificate in the listing's own postcode unit is
    # the nearest segment and must win over an equal candidate further along.
    matches = _match_direct_epc(
        _listing_matches([{"_listing_match_address": "EXAMPLE ROAD BROMLEY"}]),
        _direct_epc_candidates(
            [
                {
                    "_direct_epc_match_address": "7 EXAMPLE ROAD",
                    "_direct_epc_address": "7, Example Road",
                    "_direct_epc_match_postcode": "AA12ZZ",
                },
                {
                    "_direct_epc_row": 1,
                    "_direct_epc_match_address": "9 EXAMPLE ROAD",
                    "_direct_epc_address": "9, Example Road",
                    "_direct_epc_match_postcode": "AA11AA",
                },
            ]
        ),
    )

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["9, Example Road"]


def test_match_direct_epc_street_fallback_recovers_numbered_listing() -> None:
    # A numbered listing whose house number has no certificate (number sets
    # disjoint, so the strict gate skips every candidate) still picks up a
    # street-representative certificate via the fallback.
    matches = _match_direct_epc(
        _listing_matches([{"_listing_match_address": "17 EXAMPLE ROAD BROMLEY"}]),
        _direct_epc_candidates([{"_direct_epc_match_address": "9 EXAMPLE ROAD"}]),
    )

    assert matches.height == 1
    assert matches["_direct_epc_match_method"].to_list() == ["street"]


def test_match_direct_epc_street_fallback_rejects_town_only_address() -> None:
    # A town-only listing address ("COULSDON SURREY") shares only the locality
    # suffix that most street keys in the outcode carry; without a street-name
    # anchor it must not subset-inflate onto an arbitrary street.
    matches = _match_direct_epc(
        _listing_matches([{"_listing_match_address": "COULSDON SURREY"}]),
        _direct_epc_candidates(
            [
                {
                    "_direct_epc_row": i,
                    "_direct_epc_match_address": f"{number} {street} SURREY COULSDON",
                }
                for i, (number, street) in enumerate(
                    [
                        ("49", "LACKFORD ROAD"),
                        ("12", "CHIPSTEAD VALLEY ROAD"),
                        ("3", "WINDERMERE ROAD"),
                    ]
                )
            ]
        ),
    )

    assert matches.height == 0


def test_match_direct_epc_street_fallback_rejects_single_token_query() -> None:
    # token_set_ratio scores 100 whenever the query's tokens subset the
    # candidate's, so a bare one-token name must not street-match anything.
    matches = _match_direct_epc(
        _listing_matches([{"_listing_match_address": "KINGSWOOD"}]),
        _direct_epc_candidates([{"_direct_epc_match_address": "4 KINGSWOOD ROAD"}]),
    )

    assert matches.height == 0


def test_match_direct_epc_street_fallback_rejects_different_street() -> None:
    # The fallback is street-identity within the postcode, not "anything in the
    # postcode": a certificate on another street must not match.
    matches = _match_direct_epc(
        _listing_matches([{"_listing_match_address": "OLDSTEAD ROAD BROMLEY"}]),
        _direct_epc_candidates([{"_direct_epc_match_address": "5 CAMBRIDGE ROAD"}]),
    )

    assert matches.height == 0


def test_normalize_uprn_handles_types_and_floats() -> None:
    assert _normalize_uprn(None) is None
    assert _normalize_uprn("") is None
    assert _normalize_uprn(" 100012345678 ") == "100012345678"
    assert _normalize_uprn(100012345678) == "100012345678"
    # An integral float normalises to its digits, NOT "1230".
    assert _normalize_uprn(123.0) == "123"
    # Non-integral / NaN floats are rejected rather than mangled.
    assert _normalize_uprn(1.5) is None
    assert _normalize_uprn(float("nan")) is None


def test_coalesce_direct_epc_was_council_house_prefers_yes() -> None:
    # The raw property value is fill_null("No") upstream, so a plain coalesce
    # would let a non-null "No" override a directly-matched listing "Yes".
    # "Former council house" should fire if EITHER side says "Yes".
    none_col = [None] * 5
    wide = pl.LazyFrame(
        {
            "was_council_house": ["No", "Yes", "No", None, None],
            "_direct_was_council_house": ["Yes", "No", None, "Yes", None],
            # An unrelated direct-EPC column keeps the plain-coalesce behaviour.
            "current_energy_rating": [None, "C", "D", None, None],
            "_direct_current_energy_rating": ["B", "A", None, "E", None],
            # _coalesce_direct_epc_columns coalesces every pair in
            # _DIRECT_EPC_RAW_COLUMN_MAP, so the rest must be present too.
            "epc_address": none_col,
            "_direct_epc_address": none_col,
            "potential_energy_rating": none_col,
            "_direct_potential_energy_rating": none_col,
            "total_floor_area": none_col,
            "_direct_total_floor_area": none_col,
            "number_habitable_rooms": none_col,
            "_direct_number_habitable_rooms": none_col,
            "floor_height": none_col,
            "_direct_floor_height": none_col,
            "construction_age_band": none_col,
            "_direct_construction_age_band": none_col,
            "is_construction_date_approximate": none_col,
            "_direct_is_construction_date_approximate": none_col,
        }
    )

    result = _coalesce_direct_epc_columns(wide).collect()

    assert result["was_council_house"].to_list() == ["Yes", "Yes", "No", "Yes", None]
    # Plain coalesce (raw wins when non-null) is untouched for other columns.
    assert result["current_energy_rating"].to_list() == ["B", "C", "D", "E", None]


def test_join_area_side_tables_preserves_missing_crime_as_null() -> None:
    # The crime table is LEFT-joined per postcode; a postcode absent from it
    # must NOT be fabricated as "zero crime" (the safest value). The Serious/Minor
    # rollups are precomputed in crime_spatial (the mean of the by-year rollup
    # bars), so the merge reads them straight through; a missing postcode leaves
    # them null.
    base = pl.LazyFrame(
        {
            "postcode": ["AA1 1AA", "BB2 2BB"],
            "lsoa21": ["E01000001", "E01000002"],
            "Local Authority District code (2024)": ["E09000001", "E09000002"],
            "pcon": ["E14000001", "E14000002"],
        }
    )

    def _by_postcode(extra: dict) -> pl.LazyFrame:
        return pl.LazyFrame({"postcode": ["AA1 1AA", "BB2 2BB"], **extra})

    # Crime is present only for AA1 1AA; BB2 2BB is absent from the table. The
    # rollup headlines are precomputed values (deliberately NOT the per-type sum,
    # which would be 10.0 each) so this test proves the merge consumes the
    # precomputed column rather than re-summing per-type columns.
crime = pl.LazyFrame( { "postcode": ["AA1 1AA"], "Violence and sexual offences (avg/yr)": [1.0], "Robbery (avg/yr)": [2.0], "Burglary (avg/yr)": [3.0], "Possession of weapons (avg/yr)": [4.0], "Anti-social behaviour (avg/yr)": [1.0], "Criminal damage and arson (avg/yr)": [1.0], "Shoplifting (avg/yr)": [1.0], "Bicycle theft (avg/yr)": [1.0], "Theft from the person (avg/yr)": [1.0], "Other theft (avg/yr)": [1.0], "Vehicle crime (avg/yr)": [1.0], "Public order (avg/yr)": [1.0], "Drugs (avg/yr)": [1.0], "Other crime (avg/yr)": [1.0], "Serious crime (avg/yr)": [7.5], "Minor crime (avg/yr)": [4.2], } ) joined = _join_area_side_tables( base, iod=pl.LazyFrame({"LSOA code (2021)": ["E01000001", "E01000002"]}), ethnicity=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}), crime=crime, median_age=pl.LazyFrame({"lsoa21": ["E01000001", "E01000002"]}), election=pl.LazyFrame({"pcon": ["E14000001", "E14000002"]}), poi_counts=_by_postcode({}), noise=_by_postcode({}), school_catchments=_by_postcode({}), conservation_areas=_by_postcode({CONSERVATION_AREA_FEATURE: ["Yes", "No"]}), tree_density=None, broadband=pl.LazyFrame( { "bb_postcode": ["AA1 1AA", "BB2 2BB"], "max_download_speed": pl.Series([100, 300], dtype=pl.UInt16), } ), ).collect() by_postcode = { row["postcode"]: row for row in joined.select( "postcode", "serious_crime_avg_yr", "minor_crime_avg_yr" ).iter_rows(named=True) } # Present postcode: rollups are the precomputed headline values, read through # unchanged (NOT the per-type sum of 10.0). assert by_postcode["AA1 1AA"]["serious_crime_avg_yr"] == 7.5 assert by_postcode["AA1 1AA"]["minor_crime_avg_yr"] == 4.2 # Missing postcode: rollups stay null rather than fabricating 0.0. 
assert by_postcode["BB2 2BB"]["serious_crime_avg_yr"] is None assert by_postcode["BB2 2BB"]["minor_crime_avg_yr"] is None def test_dedupe_collapsed_properties_keeps_most_recent_per_address() -> None: # The terminated-postcode remap can merge two distinct postcodes onto one # active successor, collapsing the same physical address onto a single # (postcode, pp_address) key with conflicting sale records. The dedup must # keep exactly one row per (postcode, pp_address) -- the most recent # transaction -- and must not collapse genuinely distinct addresses. from datetime import datetime wide = pl.LazyFrame( { "postcode": ["SW3 3JY", "SW3 3JY", "SW3 3JY"], "pp_address": ["45 ELYSTAN PLACE", "45 ELYSTAN PLACE", "9 OTHER ROAD"], "date_of_transfer": [ datetime(1990, 1, 1), datetime(2015, 6, 1), datetime(2000, 1, 1), ], "latest_price": [1_587_700, 4_500_000, 250_000], } ) out = _dedupe_collapsed_properties(wide).collect() # One row per (postcode, pp_address): the two ELYSTAN PLACE rows collapse to one. assert out.height == 2 assert out.select(["postcode", "pp_address"]).is_unique().all() by_addr = {r["pp_address"]: r for r in out.iter_rows(named=True)} # The kept ELYSTAN PLACE row is the most recent transaction (2015 @ 4.5M), # not an arbitrary one. assert by_addr["45 ELYSTAN PLACE"]["date_of_transfer"] == datetime(2015, 6, 1) assert by_addr["45 ELYSTAN PLACE"]["latest_price"] == 4_500_000 # A genuinely distinct address in the same postcode is untouched. 
assert by_addr["9 OTHER ROAD"]["latest_price"] == 250_000 def _property_candidates(rows: list[dict]) -> pl.DataFrame: base = { "postcode": "AA1 1AA", "pp_address": "1 Example Road", "_property_match_postcode": "AA11AA", "_property_match_address": "1 EXAMPLE ROAD", "_property_epc_match_address": "1 EXAMPLE ROAD", "uprn": None, } return pl.DataFrame( [{**base, **row} for row in rows], schema={ "postcode": pl.Utf8, "pp_address": pl.Utf8, "_property_match_postcode": pl.Utf8, "_property_match_address": pl.Utf8, "_property_epc_match_address": pl.Utf8, "uprn": pl.Utf8, }, ) def test_match_listing_properties_uprn_wins_dedup_tie() -> None: # Two listings claim the same property: one by UPRN, one by exact address # (both score 100). The UPRN match must win even though it has the higher # _listing_idx (which would otherwise break the tie the wrong way). listings = _listing_matches( [ { "_listing_idx": 5, "_listing_uprn": "100000000001", "_listing_match_address": "SOMETHING ELSE", }, { "_listing_idx": 1, "_listing_uprn": None, "_listing_match_address": "1 EXAMPLE ROAD", }, ] ) matches = _match_listing_properties( listings, _property_candidates([{"uprn": "100000000001"}]) ) assert matches.height == 1 assert matches["_listing_idx"].to_list() == [5] assert matches["_property_match_method"].to_list() == ["uprn"] def test_match_direct_epc_does_not_match_other_outcode_without_uprn() -> None: # Matching is by postcode/UPRN/street — never by coordinate proximity — and # the street fallback is outcode-scoped, so a same-street EPC in a different # OUTCODE with no shared UPRN is skipped. 
matches = _match_direct_epc( _listing_matches([{"_listing_match_postcode": "AA11AA"}]), _direct_epc_candidates( [ { "_direct_epc_match_postcode": "BB22BB", "_direct_epc_outcode": "BB2", "_direct_epc_uprn": None, } ] ), ) assert matches.height == 0 def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None: listings_path = tmp_path / "listings.parquet" arcgis_path = tmp_path / "arcgis.parquet" _sample_listings_frame().write_parquet(listings_path) _stub_arcgis(arcgis_path) wide = pl.DataFrame( { "postcode": ["SW1A 1AA", "SW1A 1AA"], "pp_address": ["9 Other Road", "1 Example Road"], "pp_property_type": ["Detached", "Terraced"], "duration": ["Freehold", "Freehold"], "total_floor_area": [80.0, 90.0], "number_habitable_rooms": [3, 4], "latest_price": [500_000, 600_000], "epc_address": [None, "1 Example Road"], "current_energy_rating": [None, "C"], "potential_energy_rating": [None, "B"], "floor_height": [None, 2.4], "construction_age_band": [None, 1930], "is_construction_date_approximate": [None, 1], "was_council_house": [None, "No"], }, schema={ "postcode": pl.Utf8, "pp_address": pl.Utf8, "pp_property_type": pl.Utf8, "duration": pl.Utf8, "total_floor_area": pl.Float64, "number_habitable_rooms": pl.Int16, "latest_price": pl.Int64, "epc_address": pl.Utf8, "current_energy_rating": pl.Utf8, "potential_energy_rating": pl.Utf8, "floor_height": pl.Float64, "construction_age_band": pl.UInt16, "is_construction_date_approximate": pl.UInt8, "was_council_house": pl.Utf8, }, ) integrated = _integrate_listings( wide.lazy(), listings_path, arcgis_path, epc_path=None ).collect() matched = integrated.filter(pl.col("pp_address") == "1 Example Road") other = integrated.filter(pl.col("pp_address") == "9 Other Road") assert matched["_actual_listing_url"].to_list() == ["https://example.test/abc"] assert other["_actual_listing_url"].to_list() == [None] def test_integrate_listings_matches_by_uprn_over_address(tmp_path) -> None: # The listing's address deliberately 
does not match the property's, but the # shared UPRN drives an exact match anyway (UPRN beats fuzzy street). listings_path = tmp_path / "listings.parquet" arcgis_path = tmp_path / "arcgis.parquet" _sample_listings_frame().with_columns( pl.lit("Totally Different Road").alias("Address per Property Register"), pl.lit("100000000009").alias("UPRN"), ).write_parquet(listings_path) _stub_arcgis(arcgis_path) wide = pl.DataFrame( { "postcode": ["SW1A 1AA"], "pp_address": ["1 Example Road"], "uprn": ["100000000009"], "pp_property_type": ["Terraced"], "duration": ["Freehold"], "total_floor_area": [90.0], "number_habitable_rooms": [4], "latest_price": [600_000], "epc_address": ["1 Example Road"], "current_energy_rating": ["C"], "potential_energy_rating": ["B"], "floor_height": [2.4], "construction_age_band": [1930], "is_construction_date_approximate": [1], "was_council_house": ["No"], }, schema={ "postcode": pl.Utf8, "pp_address": pl.Utf8, "uprn": pl.Utf8, "pp_property_type": pl.Utf8, "duration": pl.Utf8, "total_floor_area": pl.Float64, "number_habitable_rooms": pl.Int16, "latest_price": pl.Int64, "epc_address": pl.Utf8, "current_energy_rating": pl.Utf8, "potential_energy_rating": pl.Utf8, "floor_height": pl.Float64, "construction_age_band": pl.UInt16, "is_construction_date_approximate": pl.UInt8, "was_council_house": pl.Utf8, }, ) integrated = _integrate_listings( wide.lazy(), listings_path, arcgis_path, epc_path=None ).collect() matched = integrated.filter(pl.col("pp_address") == "1 Example Road") # The listing overlay attached to the UPRN-matched property row. assert matched["_actual_listing_url"].to_list() == ["https://example.test/abc"] # No spurious seed row for the listing's (non-matching) address. 
assert "Totally Different Road" not in integrated["pp_address"].to_list() def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> None: # A number-less listing whose street is not the property's street (and which # shares no UPRN) must not be force-matched onto it; it becomes its own seed # row instead of stamping the wrong property's overlay. listings_path = tmp_path / "listings.parquet" arcgis_path = tmp_path / "arcgis.parquet" _sample_listings_frame().with_columns( pl.lit("Juniper Crescent").alias("Address per Property Register"), ).write_parquet(listings_path) _stub_arcgis(arcgis_path) wide = pl.DataFrame( { "postcode": ["SW1A 1AA"], "pp_address": ["Old Cottage High Street"], "pp_property_type": ["Terraced"], "duration": ["Freehold"], "total_floor_area": [120.0], "number_habitable_rooms": [4], "latest_price": [750_000], "epc_address": ["Old Cottage High Street"], "current_energy_rating": ["C"], "potential_energy_rating": ["B"], "floor_height": [2.4], "construction_age_band": [1930], "is_construction_date_approximate": [1], "was_council_house": ["No"], }, schema={ "postcode": pl.Utf8, "pp_address": pl.Utf8, "pp_property_type": pl.Utf8, "duration": pl.Utf8, "total_floor_area": pl.Float64, "number_habitable_rooms": pl.Int16, "latest_price": pl.Int64, "epc_address": pl.Utf8, "current_energy_rating": pl.Utf8, "potential_energy_rating": pl.Utf8, "floor_height": pl.Float64, "construction_age_band": pl.UInt16, "is_construction_date_approximate": pl.UInt8, "was_council_house": pl.Utf8, }, ) integrated = _integrate_listings( wide.lazy(), listings_path, arcgis_path, epc_path=None ).collect() existing = integrated.filter(pl.col("pp_address") == "Old Cottage High Street") seed = integrated.filter(pl.col("pp_address") == "Juniper Crescent") assert existing["_actual_listing_url"].to_list() == [None] assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"] def 
test_best_listing_match_rejects_numberless_listing_against_numbered_property() -> ( None ): # Regression: a number-less listing (street/locality only) must NOT match a # numbered property. The number gate is unconditional (like fuzzy_join), and # the score is token_sort_ratio only, so a single locality token can no # longer subset-inflate to 100 against a long numbered address. candidates = [{"pp_address": "FLAT A3 CHESHAM HEIGHTS ST MONICAS ROAD"}] result = _best_listing_match( listing_uprn=None, query="KINGSWOOD", uprn_index={}, bucket_candidates=candidates, addressed_fields=["pp_address"], ) assert result is None def test_best_listing_match_allows_numberless_to_numberless_named_house() -> None: # A number-less listing CAN still match a number-less (named-house) property # when the street/name matches almost exactly. candidates = [{"pp_address": "WOODLANDS HOUSE OAK LANE"}] result = _best_listing_match( listing_uprn=None, query="WOODLANDS HOUSE OAK LANE", uprn_index={}, bucket_candidates=candidates, addressed_fields=["pp_address"], ) assert result is not None candidate, score, method, field = result assert method == "address" assert score >= 90.0 def test_best_listing_match_still_matches_numbered_listing_to_numbered_property() -> ( None ): # No regression for numbered listings: the number gate still permits a # compatible house number and the lower with-numbers threshold applies. candidates = [{"pp_address": "10 OAK LANE"}] result = _best_listing_match( listing_uprn=None, query="10 OAK LANE", uprn_index={}, bucket_candidates=candidates, addressed_fields=["pp_address"], ) assert result is not None _candidate, score, method, _field = result assert method == "address" assert score >= 82.0 def test_best_listing_match_numbered_listing_with_trailing_locality_still_matches() -> ( None ): # A scraped numbered listing often appends town/county tokens that the bare # Price-Paid register address omits. 
token_sort alone would score this ~73 # (below 82) and drop a correct match; token_set (allowed for numbered # queries, where the number gate makes it safe) recovers it. candidates = [{"pp_address": "105 RIDGEWAY DRIVE"}] result = _best_listing_match( listing_uprn=None, query="105 RIDGEWAY DRIVE BROMLEY KENT", uprn_index={}, bucket_candidates=candidates, addressed_fields=["pp_address"], ) assert result is not None candidate, score, _method, _field = result assert candidate["pp_address"] == "105 RIDGEWAY DRIVE" assert score >= 82.0 def test_best_listing_match_numbered_query_cannot_subset_inflate_across_numbers() -> ( None ): # token_set for numbered queries is safe only because the number gate runs # first: a query and candidate with incompatible house numbers never reach # scoring, so token_set cannot inflate "10 OAK LANE" onto "12 OAK LANE". candidates = [{"pp_address": "12 OAK LANE KINGSTON"}] result = _best_listing_match( listing_uprn=None, query="10 OAK LANE", uprn_index={}, bucket_candidates=candidates, addressed_fields=["pp_address"], ) assert result is None def test_best_listing_match_letter_suffix_flats_do_not_cross_match() -> None: # Regression: the gate uses fuzzy_join's suffix-aware tokens, so "8A" and # "8B" are different numbers. Under the old digit-only tokens both looked # like {8} and token_sort scored ~93, attaching the wrong flat's record # whenever the true candidate was absent from the bucket. candidates = [{"pp_address": "8B HIGH STREET"}] result = _best_listing_match( listing_uprn=None, query="8A HIGH STREET", uprn_index={}, bucket_candidates=candidates, addressed_fields=["pp_address"], ) assert result is None def test_best_listing_match_building_listing_cannot_absorb_single_flat() -> None: # Regression: set equality (not subset) over number tokens, so a whole- # building listing "188 GREAT NORTH WAY" no longer matches "FLAT 1 188 # GREAT NORTH WAY" (token_set would have scored the pair 100). 
candidates = [{"pp_address": "FLAT 1 188 GREAT NORTH WAY"}] result = _best_listing_match( listing_uprn=None, query="188 GREAT NORTH WAY", uprn_index={}, bucket_candidates=candidates, addressed_fields=["pp_address"], ) assert result is None def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> ( None ): df = pl.DataFrame( { "Postcode": ["SW1A 1AA", "SW1A 1AA"], "Address per Property Register": ["1 Example Road", "2 Example Road"], "Address per EPC": ["1 Example Road", None], "Date of last transaction": [1990.0, None], "lat": [51.5, 51.5], "lon": [-0.1, -0.1], "Total floor area (sqm)": [100.0, 95.0], "Number of bedrooms & living rooms": [3, None], "Property type": ["Terraced", None], "Leasehold/Freehold": ["Leasehold", None], "Last known price": [500_000, None], "Tree canopy density percentile": [42.0, 42.0], # Overlay columns: row 0 is a matched listing, row 1 is unmatched, row none. "_actual_listing_url": ["url0", "url1"], "_actual_asking_price": [600_000, 700_000], "_actual_asking_price_per_sqm": [5_000, None], "_actual_listing_date": [None, None], "_actual_listing_status": ["For sale", "For sale"], "_actual_listing_features": [["Garden"], ["Parking"]], "_actual_bedrooms": [3, 4], "_actual_bathrooms": [1, 2], "_actual_price_qualifier": ["", ""], "_actual_property_sub_type": ["Mid-Terrace", "End-Terrace"], "_actual_lat": [51.51, 51.52], "_actual_lon": [-0.11, -0.12], "_actual_total_floor_area": [110.0, None], "_actual_number_habitable_rooms": [4, 3], "_actual_property_type": ["Terraced", "Flats/Maisonettes"], "_actual_leasehold_freehold": ["Freehold", "Leasehold"], }, schema={ "Postcode": pl.Utf8, "Address per Property Register": pl.Utf8, "Address per EPC": pl.Utf8, "Date of last transaction": pl.Float64, "lat": pl.Float64, "lon": pl.Float64, "Total floor area (sqm)": pl.Float64, "Number of bedrooms & living rooms": pl.Int16, "Property type": pl.Utf8, "Leasehold/Freehold": pl.Utf8, "Last known price": pl.Int64, "Tree canopy density 
percentile": pl.Float32, "_actual_listing_url": pl.Utf8, "_actual_asking_price": pl.Int64, "_actual_asking_price_per_sqm": pl.Int32, "_actual_listing_date": pl.Datetime("us"), "_actual_listing_status": pl.Utf8, "_actual_listing_features": pl.List(pl.Utf8), "_actual_bedrooms": pl.Int32, "_actual_bathrooms": pl.Int32, "_actual_price_qualifier": pl.Utf8, "_actual_property_sub_type": pl.Utf8, "_actual_lat": pl.Float64, "_actual_lon": pl.Float64, "_actual_total_floor_area": pl.Float64, "_actual_number_habitable_rooms": pl.Int16, "_actual_property_type": pl.Utf8, "_actual_leasehold_freehold": pl.Utf8, }, ) finalized = _finalize_listings(df).sort("Address per Property Register") assert finalized.height == 2 assert finalized["Listing URL"].to_list() == ["url0", "url1"] assert finalized["Asking price"].to_list() == [600_000, 700_000] assert finalized["Asking price per sqm"].to_list() == [5_000, 7_368] assert finalized["Est. price per sqm"].to_list() == [5_000, 7_368] assert finalized["Estimated current price"].to_list() == [600_000, 700_000] assert finalized["Last known price"].to_list() == [500_000, 700_000] # Listing's preferred floor area / property type / tenure. assert finalized["Total floor area (sqm)"].to_list() == [110.0, 95.0] # Rooms prefer the EPC habitable-room count over the listing's beds+baths # value: row 0 keeps the EPC 3 (not the listing's _actual 4); row 1 has no # EPC count so it falls back to the listing's 3. assert finalized["Number of bedrooms & living rooms"].to_list() == [3, 3] assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"] assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"] # Postcode-level feature carried through to both matched and unmatched rows. assert finalized["Tree canopy density percentile"].to_list() == [42.0, 42.0] # Match status reflects historical context availability. 
assert finalized["Historical property match status"].to_list() == [ "matched", "unmatched", ] # Overlay scaffolding is dropped. for src, dst, _dt in _LISTING_OVERLAY_SOURCES: assert dst not in finalized.columns, src def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None: # The terminated-postcode remap can collapse two distinct wide rows onto the same # (postcode, pp_address), so a single matched listing attaches to both. Finalize # must emit one row per listing URL, not one per collapsed wide row. df = pl.DataFrame( { "Postcode": ["SW1A 1AA", "SW1A 1AA"], "Address per Property Register": ["1 Example Road", "1 Example Road"], "Address per EPC": ["1 Example Road", "1 Example Road"], "Date of last transaction": [1990.0, 1995.0], "lat": [51.5, 51.5], "lon": [-0.1, -0.1], "Total floor area (sqm)": [100.0, 95.0], "Number of bedrooms & living rooms": [3, 3], "Property type": ["Terraced", "Terraced"], "Leasehold/Freehold": ["Leasehold", "Leasehold"], "Last known price": [500_000, 480_000], "Tree canopy density percentile": [42.0, 42.0], # Same listing URL on both collapsed rows — the fan-out to fix. 
"_actual_listing_url": ["url0", "url0"], "_actual_asking_price": [600_000, 600_000], "_actual_asking_price_per_sqm": [5_000, 5_000], "_actual_listing_date": [None, None], "_actual_listing_status": ["For sale", "For sale"], "_actual_listing_features": [["Garden"], ["Garden"]], "_actual_bedrooms": [3, 3], "_actual_bathrooms": [1, 1], "_actual_price_qualifier": ["", ""], "_actual_property_sub_type": ["Mid-Terrace", "Mid-Terrace"], "_actual_lat": [51.51, 51.51], "_actual_lon": [-0.11, -0.11], "_actual_total_floor_area": [110.0, 110.0], "_actual_number_habitable_rooms": [4, 4], "_actual_property_type": ["Terraced", "Terraced"], "_actual_leasehold_freehold": ["Freehold", "Freehold"], }, schema={ "Postcode": pl.Utf8, "Address per Property Register": pl.Utf8, "Address per EPC": pl.Utf8, "Date of last transaction": pl.Float64, "lat": pl.Float64, "lon": pl.Float64, "Total floor area (sqm)": pl.Float64, "Number of bedrooms & living rooms": pl.Int16, "Property type": pl.Utf8, "Leasehold/Freehold": pl.Utf8, "Last known price": pl.Int64, "Tree canopy density percentile": pl.Float32, "_actual_listing_url": pl.Utf8, "_actual_asking_price": pl.Int64, "_actual_asking_price_per_sqm": pl.Int32, "_actual_listing_date": pl.Datetime("us"), "_actual_listing_status": pl.Utf8, "_actual_listing_features": pl.List(pl.Utf8), "_actual_bedrooms": pl.Int32, "_actual_bathrooms": pl.Int32, "_actual_price_qualifier": pl.Utf8, "_actual_property_sub_type": pl.Utf8, "_actual_lat": pl.Float64, "_actual_lon": pl.Float64, "_actual_total_floor_area": pl.Float64, "_actual_number_habitable_rooms": pl.Int16, "_actual_property_type": pl.Utf8, "_actual_leasehold_freehold": pl.Utf8, }, ) finalized = _finalize_listings(df) assert finalized.height == 1 assert finalized["Listing URL"].to_list() == ["url0"]