import polars as pl
import pyarrow as pa
import pytest
from shapely import box, to_wkb
from shapely.geometry import Point

from pipeline.transform.merge import (
    _AREA_COLUMNS,
    CONSERVATION_AREA_FEATURE,
    LISTED_BUILDING_FEATURE,
    TREE_DENSITY_FEATURE,
    _LISTING_OVERLAY_SOURCES,
    _build_unmatched_listing_seed_rows,
    _canonical_postcode_expr,
    _finalize_listings,
    _integrate_listings,
    _match_direct_epc,
    _is_dynamic_poi_metric_column,
    _less_deprived_percentile_expr,
    _load_conservation_area_geometries,
    _load_listings_for_merge,
    _matched_listed_building_flags,
    _postcode_conservation_area_flags,
    _postcode_listed_building_candidates,
    _tree_density_by_postcode,
    _validate_lad_source_coverage,
    _validate_property_postcodes,
)


def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """The lowest score gets percentile 100, the highest gets 0, nulls stay null."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})

    percentile_expr = _less_deprived_percentile_expr(column)
    ranked = scores.lazy().with_columns(percentile_expr).collect()

    assert ranked[column].to_list() == [100.0, 50.0, 0.0, None]


def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied minimum and maximum scores land exactly on 100 and 0 respectively."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]})

    percentile_expr = _less_deprived_percentile_expr(column)
    ranked = scores.lazy().with_columns(percentile_expr).collect()

    assert ranked[column].to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]


def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Amenity distance/count column names are recognised; other names are not."""
    dynamic_names = (
        "Distance to nearest amenity (Cafe) (km)",
        "Distance to nearest amenity (Park) (km)",
        "Number of amenities (Cafe) within 2km",
        "Number of amenities (Cafe) within 5km",
    )
    for name in dynamic_names:
        assert _is_dynamic_poi_metric_column(name)

    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")


def test_country_code_is_kept_in_postcode_area_columns() -> None:
    """The ``ctry25cd`` country-code column is listed in ``_AREA_COLUMNS``."""
    country_code_column = "ctry25cd"
    assert country_code_column in _AREA_COLUMNS


def test_conservation_area_feature_is_area_level() -> None:
    """The conservation-area feature is listed in ``_AREA_COLUMNS``."""
    is_area_level = CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
    assert is_area_level


def test_listed_building_feature_is_property_level() -> None:
    """The listed-building feature is absent from ``_AREA_COLUMNS``."""
    is_area_level = LISTED_BUILDING_FEATURE in _AREA_COLUMNS
    assert not is_area_level


def test_postcode_conservation_area_flags_marks_point_membership() -> None:
    """Postcodes inside the polygon are "Yes"; outside or missing a coordinate, "No"."""
    unit_square = box(0, 0, 1, 1)
    postcode_points = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
            "lat": [0.5, 2.0, None],
            "lon": [0.5, 2.0, 0.5],
        }
    )

    flags = _postcode_conservation_area_flags(
        postcode_points, [unit_square], "EPSG:4326", batch_size=2
    )

    expected_flags = [("AA1 1AA", "Yes"), ("BB1 1BB", "No"), ("CC1 1CC", "No")]
    assert flags.sort("postcode").to_dicts() == [
        {"postcode": postcode, CONSERVATION_AREA_FEATURE: flag}
        for postcode, flag in expected_flags
    ]


def test_load_conservation_area_geometries_uses_current_planning_data_records(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path,
) -> None:
    """Only the open-ended conservation-area polygon is returned by the loader.

    The stubbed source holds a current polygon, a polygon with an end date, a
    record from another dataset, and a point record; just the first is expected.
    """
    source_path = tmp_path / "conservation_areas.geojson"
    current_polygon = box(0, 0, 1, 1)
    # One (dataset, end-date, name, geometry) tuple per stubbed source record.
    records = [
        ("conservation-area", "", "Central Village", current_polygon),
        ("conservation-area", "2025-01-01", "Old Boundary", box(2, 2, 3, 3)),
        ("listed-building", "", "Other", box(4, 4, 5, 5)),
        ("conservation-area", "", "Point Record", Point(0.5, 0.5)),
    ]

    def fake_read_arrow(path):
        assert path == source_path
        table = pa.table(
            {
                "dataset": [record[0] for record in records],
                "end-date": [record[1] for record in records],
                "name": [record[2] for record in records],
                "SHAPE": to_wkb([record[3] for record in records]),
            }
        )
        return {"geometry_name": "SHAPE", "crs": "EPSG:4326"}, table

    monkeypatch.setattr("pipeline.transform.merge.pyogrio.read_arrow", fake_read_arrow)

    geometries, crs = _load_conservation_area_geometries(source_path)

    assert crs == "EPSG:4326"
    assert geometries == [current_polygon]


def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
    """Only the listed point with a postcode a few metres away yields a candidate."""
    # Upper-case the name, collapse non-alphanumerics and whitespace, then trim.
    match_name_expr = (
        pl.col("Name")
        .str.to_uppercase()
        .str.replace_all(r"[^0-9A-Z]+", " ")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
        .alias("_listed_match_name")
    )
    raw_listed_points = pl.DataFrame(
        {
            "ListEntry": [1234, 5678],
            "Name": ["1 and 2 High Street", "Distant Hall"],
            "Grade": ["II", "I"],
            "Easting": [100.0, 1000.0],
            "Northing": [100.0, 1000.0],
        }
    )
    listed_points = raw_listed_points.with_columns(match_name_expr)
    active_postcodes = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "east1m": [105.0, 5000.0],
            "north1m": [105.0, 5000.0],
        }
    )

    candidates = _postcode_listed_building_candidates(
        listed_points,
        active_postcodes,
        nearest_postcodes=1,
        max_distance_m=25,
    )

    candidate_rows = candidates.select("postcode", "_listed_match_name").to_dicts()
    assert candidate_rows == [
        {"postcode": "AA1 1AA", "_listed_match_name": "1 AND 2 HIGH STREET"}
    ]


def test_matched_listed_building_flags_requires_address_match() -> None:
    """Only properties whose address matches a listed candidate are flagged "Yes"."""
    property_rows = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AA", "BB1 1BB"],
            "pp_address": ["1 HIGH STREET", "99 HIGH STREET", "THE OLD RECTORY"],
            "epc_address": ["1, High Street", "99, High Street", "Old Rectory"],
        }
    )
    candidate_rows = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "_listed_match_name": ["1 AND 2 HIGH STREET", "OLD RECTORY"],
            "_listed_grade": ["II", "II*"],
            "_listed_entry": [1234, 5678],
        }
    )

    flags = _matched_listed_building_flags(
        property_rows.lazy(), candidate_rows, min_score=95
    )
    ordered_flags = flags.sort("postcode", "pp_address")

    # "99 HIGH STREET" shares a postcode with a candidate but must not be flagged.
    flagged_keys = [("AA1 1AA", "1 HIGH STREET"), ("BB1 1BB", "THE OLD RECTORY")]
    assert ordered_flags.to_dicts() == [
        {
            "postcode": postcode,
            "pp_address": address,
            LISTED_BUILDING_FEATURE: "Yes",
        }
        for postcode, address in flagged_keys
    ]


def test_validate_property_postcodes_rejects_blank_rows() -> None:
    """A row with an empty-string postcode makes validation raise ``ValueError``."""
    rows_with_blank_postcode = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", ""],
            "Address per Property Register": ["1 Example Street", "2 Example Street"],
            "Last known price": [100_000, 200_000],
        }
    )
    expected_message = "Property rows missing a postcode"

    with pytest.raises(ValueError, match=expected_message):
        _validate_property_postcodes(rows_with_blank_postcode)


def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
) -> None:
    """Rental data lacking Isles of Scilly and City of London is accepted.

    The call is expected to return without raising; there is no other assertion.
    """
    lad_codes = ["E08000016", "E06000053", "E09000001"]
    lad_names = ["Barnsley", "Isles of Scilly", "City of London"]
    iod_path, ethnicity_path, rental_path = (
        tmp_path / f"{stem}.parquet" for stem in ("iod", "ethnicity", "rental")
    )

    iod = pl.DataFrame(
        {
            "Local Authority District code (2024)": lad_codes,
            "Local Authority District name (2024)": lad_names,
        }
    )
    iod.write_parquet(iod_path)

    ethnicity = pl.DataFrame({"Geography_code": lad_codes})
    ethnicity.write_parquet(ethnicity_path)

    # Rental rows exist for Barnsley only.
    rental = pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]})
    rental.write_parquet(rental_path)

    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)


def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    """Rental data that omits Barnsley makes validation raise ``ValueError``."""
    barnsley_code = "E08000016"
    iod_path, ethnicity_path, rental_path = (
        tmp_path / f"{stem}.parquet" for stem in ("iod", "ethnicity", "rental")
    )

    iod = pl.DataFrame(
        {
            "Local Authority District code (2024)": [barnsley_code],
            "Local Authority District name (2024)": ["Barnsley"],
        }
    )
    iod.write_parquet(iod_path)

    ethnicity = pl.DataFrame({"Geography_code": [barnsley_code]})
    ethnicity.write_parquet(ethnicity_path)

    # The only rental row belongs to a different LAD code.
    rental = pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]})
    rental.write_parquet(rental_path)

    with pytest.raises(ValueError, match="Rental data is missing"):
        _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)


def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
    """The radius-specific percentile column is exposed as ``TREE_DENSITY_FEATURE``."""
    source_path = tmp_path / "tree_density_by_postcode.parquet"
    source = pl.DataFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            "Tree canopy density percentile within 50m": [12.5, 99.0],
        }
    )
    source.write_parquet(source_path)

    density = _tree_density_by_postcode(source_path).collect().sort("postcode")

    assert density.columns == ["postcode", TREE_DENSITY_FEATURE]
    assert density[TREE_DENSITY_FEATURE].to_list() == [12.5, 99.0]
    assert density.schema[TREE_DENSITY_FEATURE] == pl.Float32


def test_tree_density_by_postcode_requires_postcode_and_density_columns(
    tmp_path,
) -> None:
    """A file missing the density column or the postcode column is rejected."""
    # Case 1: postcode present, no density column.
    missing_density_path = tmp_path / "tree_density_by_postcode.parquet"
    missing_density = pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]})
    missing_density.write_parquet(missing_density_path)

    with pytest.raises(ValueError, match="must contain column"):
        _tree_density_by_postcode(missing_density_path)

    # Case 2: density column present, no postcode.
    missing_postcode_path = tmp_path / "missing_postcode.parquet"
    missing_postcode = pl.DataFrame(
        {"Tree canopy density percentile within 50m": [12.5]}
    )
    missing_postcode.write_parquet(missing_postcode_path)

    with pytest.raises(ValueError, match="missing required column: postcode"):
        _tree_density_by_postcode(missing_postcode_path)


def _sample_listings_frame() -> pl.DataFrame:
    """Return a one-row listings frame with an explicit dtype for every column."""
    # column name -> (dtype, values); insertion order fixes the column order.
    columns = {
        "Bedrooms": (pl.Int32, [3]),
        "Bathrooms": (pl.Int32, [2]),
        "Number of bedrooms & living rooms": (pl.Int32, [4]),
        "lon": (pl.Float64, [-0.1]),
        "lat": (pl.Float64, [51.5]),
        "Postcode": (pl.Utf8, ["sw1a1aa"]),
        "Address per Property Register": (pl.Utf8, ["1 Example Road"]),
        "Leasehold/Freehold": (pl.Utf8, ["Freehold"]),
        "Property type": (pl.Utf8, ["Terraced"]),
        "Property sub-type": (pl.Utf8, ["Mid-Terrace"]),
        "Price qualifier": (pl.Utf8, [""]),
        "Total floor area (sqm)": (pl.Float64, [120.0]),
        "Listing URL": (pl.Utf8, ["https://example.test/abc"]),
        "Listing features": (pl.List(pl.Utf8), [["Garden", "Off-street parking"]]),
        "Listing date": (pl.Datetime("us"), [None]),
        "Listing status": (pl.Utf8, ["For sale"]),
        "Asking price": (pl.Int64, [750_000]),
        "Asking price per sqm": (pl.Int32, [6_250]),
    }
    data = {name: values for name, (_dtype, values) in columns.items()}
    schema = {name: dtype for name, (dtype, _values) in columns.items()}
    return pl.DataFrame(data, schema=schema)


def _stub_arcgis(path) -> None:
    """Write a one-row postcode lookup parquet for ``SW1A 1AA`` to ``path``."""
    # column name -> (dtype, values); insertion order fixes the column order.
    columns = {
        "pcds": (pl.Utf8, ["SW1A 1AA"]),
        "ctry25cd": (pl.Utf8, ["E92000001"]),
        "doterm": (pl.Utf8, [None]),
        "east1m": (pl.Float64, [530000.0]),
        "north1m": (pl.Float64, [180000.0]),
    }
    lookup = pl.DataFrame(
        {name: values for name, (_dtype, values) in columns.items()},
        schema={name: dtype for name, (dtype, _values) in columns.items()},
    )
    lookup.write_parquet(path)


def test_canonical_postcode_expr_formats_compact_postcodes() -> None:
    """Compact and spaced postcodes canonicalise alike; "bad" and null give null."""
    raw = pl.DataFrame({"Postcode": ["sw1a1aa", "SW1A 1AA", "bad", None]})

    canonical_expr = _canonical_postcode_expr("Postcode").alias("canonical")
    canonical = raw.with_columns(canonical_expr)["canonical"]

    assert canonical.to_list() == ["SW1A 1AA", "SW1A 1AA", None, None]


def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
    tmp_path,
) -> None:
    """Loaded listings carry a canonical postcode plus ``_actual_*`` overlay columns."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    loaded = _load_listings_for_merge(listings_path, arcgis_path)

    expected_columns = {
        "postcode": ["SW1A 1AA"],
        "pp_address": ["1 Example Road"],
        "_actual_listing_url": ["https://example.test/abc"],
        "_actual_asking_price": [750_000],
        "_actual_lat": [51.5],
    }
    for column, expected_values in expected_columns.items():
        assert loaded[column].to_list() == expected_values


def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
    tmp_path,
) -> None:
    """A seed row built from an unmatched listing fills the property-shaped columns."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    listings = _load_listings_for_merge(listings_path, arcgis_path)

    schema_fields = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "pp_property_type": pl.Utf8,
        "duration": pl.Utf8,
        "total_floor_area": pl.Float64,
        "number_habitable_rooms": pl.Int16,
        "latest_price": pl.Int64,
        "epc_address": pl.Utf8,
    }
    schema_fields.update(
        (dst, dtype) for _src, dst, dtype in _LISTING_OVERLAY_SOURCES
    )
    template_schema = pl.Schema(schema_fields)

    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )

    assert seed.height == 1
    expected_columns = {
        "postcode": ["SW1A 1AA"],
        "pp_address": ["1 Example Road"],
        "pp_property_type": ["Terraced"],
        "duration": ["Freehold"],
        "total_floor_area": [120.0],
        "number_habitable_rooms": [4],
        "latest_price": [750_000],
        # Columns the listing cannot populate default to null.
        "epc_address": [None],
        # Overlay columns pass through unchanged.
        "_actual_listing_url": ["https://example.test/abc"],
    }
    for column, expected_values in expected_columns.items():
        assert seed[column].to_list() == expected_values


def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
    tmp_path,
) -> None:
    """Seed rows take ``_direct_*`` EPC values when the listing's own are null."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    # Null out the listing's floor area and room count so only EPC values remain.
    listing_without_size = _sample_listings_frame().with_columns(
        pl.lit(None, dtype=pl.Float64).alias("Total floor area (sqm)"),
        pl.lit(None, dtype=pl.Int32).alias("Number of bedrooms & living rooms"),
    )
    listing_without_size.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    direct_epc_columns = [
        pl.lit("1 Example Road").alias("_direct_epc_address"),
        pl.lit("C").alias("_direct_current_energy_rating"),
        pl.lit("B").alias("_direct_potential_energy_rating"),
        pl.lit(98.0).alias("_direct_total_floor_area"),
        pl.lit(4, dtype=pl.Int16).alias("_direct_number_habitable_rooms"),
        pl.lit(2.4).alias("_direct_floor_height"),
        pl.lit(1930, dtype=pl.UInt16).alias("_direct_construction_age_band"),
        pl.lit(1, dtype=pl.UInt8).alias("_direct_is_construction_date_approximate"),
        pl.lit("No").alias("_direct_was_council_house"),
    ]
    loaded = _load_listings_for_merge(listings_path, arcgis_path)
    listings = loaded.with_columns(*direct_epc_columns)

    schema_fields = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "total_floor_area": pl.Float64,
        "number_habitable_rooms": pl.Int16,
        "epc_address": pl.Utf8,
        "current_energy_rating": pl.Utf8,
        "was_council_house": pl.Utf8,
    }
    schema_fields.update(
        (dst, dtype) for _src, dst, dtype in _LISTING_OVERLAY_SOURCES
    )
    template_schema = pl.Schema(schema_fields)

    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )

    expected_columns = {
        "total_floor_area": [98.0],
        "number_habitable_rooms": [4],
        "epc_address": ["1 Example Road"],
        "current_energy_rating": ["C"],
        "was_council_house": ["No"],
    }
    for column, expected_values in expected_columns.items():
        assert seed[column].to_list() == expected_values


def test_match_direct_epc_considers_nearby_postcodes() -> None:
    """A listing matches an EPC row with a different postcode and nearby coordinates."""

    def build_frame(columns):
        # columns maps name -> (dtype, values); order fixes the column order.
        return pl.DataFrame(
            {name: values for name, (_dtype, values) in columns.items()},
            schema={name: dtype for name, (dtype, _values) in columns.items()},
        )

    listing_matches = build_frame(
        {
            "_listing_idx": (pl.UInt32, [0]),
            "_listing_match_address": (pl.Utf8, ["1 EXAMPLE ROAD"]),
            "_listing_match_postcode": (pl.Utf8, ["AA11AA"]),
            "_listing_east": (pl.Float64, [1000.0]),
            "_listing_north": (pl.Float64, [1000.0]),
            "_actual_property_type": (pl.Utf8, ["Terraced"]),
            "_actual_total_floor_area": (pl.Float64, [100.0]),
            "_actual_number_habitable_rooms": (pl.Int16, [4]),
        }
    )
    epc_candidates = build_frame(
        {
            "_direct_epc_row": (pl.UInt32, [0]),
            "_direct_epc_match_address": (pl.Utf8, ["1 EXAMPLE ROAD"]),
            "_direct_epc_match_postcode": (pl.Utf8, ["BB11BB"]),
            "_direct_epc_east": (pl.Float64, [1020.0]),
            "_direct_epc_north": (pl.Float64, [1010.0]),
            "_direct_epc_canonical_property_type": (pl.Utf8, ["Terraced"]),
            "_direct_epc_address": (pl.Utf8, ["1, Example Road"]),
            "_direct_current_energy_rating": (pl.Utf8, ["C"]),
            "_direct_potential_energy_rating": (pl.Utf8, ["B"]),
            "_direct_total_floor_area": (pl.Float64, [101.0]),
            "_direct_number_habitable_rooms": (pl.Int16, [4]),
            "_direct_floor_height": (pl.Float64, [2.5]),
            "_direct_construction_age_band": (pl.UInt16, [1930]),
            "_direct_is_construction_date_approximate": (pl.UInt8, [1]),
            "_direct_was_council_house": (pl.Utf8, ["No"]),
        }
    )

    matches = _match_direct_epc(listing_matches, epc_candidates)

    assert matches.height == 1
    assert matches["_listing_idx"].to_list() == [0]
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]


def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None:
    """The listing overlay lands on the matching property row and no other."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # column name -> (dtype, values); row 0 is unrelated, row 1 is the listed one.
    wide_columns = {
        "postcode": (pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        "pp_address": (pl.Utf8, ["9 Other Road", "1 Example Road"]),
        "pp_property_type": (pl.Utf8, ["Detached", "Terraced"]),
        "duration": (pl.Utf8, ["Freehold", "Freehold"]),
        "total_floor_area": (pl.Float64, [80.0, 90.0]),
        "number_habitable_rooms": (pl.Int16, [3, 4]),
        "latest_price": (pl.Int64, [500_000, 600_000]),
        "epc_address": (pl.Utf8, [None, "1 Example Road"]),
        "current_energy_rating": (pl.Utf8, [None, "C"]),
        "potential_energy_rating": (pl.Utf8, [None, "B"]),
        "floor_height": (pl.Float64, [None, 2.4]),
        "construction_age_band": (pl.UInt16, [None, 1930]),
        "is_construction_date_approximate": (pl.UInt8, [None, 1]),
        "was_council_house": (pl.Utf8, [None, "No"]),
    }
    wide = pl.DataFrame(
        {name: values for name, (_dtype, values) in wide_columns.items()},
        schema={name: dtype for name, (dtype, _values) in wide_columns.items()},
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    def listing_urls_for(address):
        rows = integrated.filter(pl.col("pp_address") == address)
        return rows["_actual_listing_url"].to_list()

    assert listing_urls_for("1 Example Road") == ["https://example.test/abc"]
    assert listing_urls_for("9 Other Road") == [None]


def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) -> None:
    """An unnumbered listing with a different house name gets its own seed row.

    "Rose Cottage High Street" must not be attached to the existing
    "Old Cottage High Street" property even though they share a postcode.
    """
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    unnumbered_listing = _sample_listings_frame().with_columns(
        pl.lit("Rose Cottage High Street").alias("Address per Property Register"),
    )
    unnumbered_listing.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # column name -> (dtype, values) for the single pre-existing property row.
    wide_columns = {
        "postcode": (pl.Utf8, ["SW1A 1AA"]),
        "pp_address": (pl.Utf8, ["Old Cottage High Street"]),
        "pp_property_type": (pl.Utf8, ["Terraced"]),
        "duration": (pl.Utf8, ["Freehold"]),
        "total_floor_area": (pl.Float64, [120.0]),
        "number_habitable_rooms": (pl.Int16, [4]),
        "latest_price": (pl.Int64, [750_000]),
        "epc_address": (pl.Utf8, ["Old Cottage High Street"]),
        "current_energy_rating": (pl.Utf8, ["C"]),
        "potential_energy_rating": (pl.Utf8, ["B"]),
        "floor_height": (pl.Float64, [2.4]),
        "construction_age_band": (pl.UInt16, [1930]),
        "is_construction_date_approximate": (pl.UInt8, [1]),
        "was_council_house": (pl.Utf8, ["No"]),
    }
    wide = pl.DataFrame(
        {name: values for name, (_dtype, values) in wide_columns.items()},
        schema={name: dtype for name, (dtype, _values) in wide_columns.items()},
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    def listing_urls_for(address):
        rows = integrated.filter(pl.col("pp_address") == address)
        return rows["_actual_listing_url"].to_list()

    assert listing_urls_for("Old Cottage High Street") == [None]
    assert listing_urls_for("Rose Cottage High Street") == ["https://example.test/abc"]


def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
    None
):
    """Overlay values become the public listing columns and the scaffolding is dropped."""
    # column name -> (dtype, values); insertion order fixes the column order.
    columns = {
        "Postcode": (pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        "Address per Property Register": (
            pl.Utf8,
            ["1 Example Road", "2 Example Road"],
        ),
        "Address per EPC": (pl.Utf8, ["1 Example Road", None]),
        "Date of last transaction": (pl.Float64, [1990.0, None]),
        "lat": (pl.Float64, [51.5, 51.5]),
        "lon": (pl.Float64, [-0.1, -0.1]),
        "Total floor area (sqm)": (pl.Float64, [100.0, 95.0]),
        "Number of bedrooms & living rooms": (pl.Int16, [3, None]),
        "Property type": (pl.Utf8, ["Terraced", None]),
        "Leasehold/Freehold": (pl.Utf8, ["Leasehold", None]),
        "Last known price": (pl.Int64, [500_000, None]),
        "Street tree density percentile": (pl.Float32, [42.0, 42.0]),
        # Overlay columns: row 0 is a matched listing, row 1 is an unmatched one.
        "_actual_listing_url": (pl.Utf8, ["url0", "url1"]),
        "_actual_asking_price": (pl.Int64, [600_000, 700_000]),
        "_actual_asking_price_per_sqm": (pl.Int32, [5_000, None]),
        "_actual_listing_date": (pl.Datetime("us"), [None, None]),
        "_actual_listing_status": (pl.Utf8, ["For sale", "For sale"]),
        "_actual_listing_features": (pl.List(pl.Utf8), [["Garden"], ["Parking"]]),
        "_actual_bedrooms": (pl.Int32, [3, 4]),
        "_actual_bathrooms": (pl.Int32, [1, 2]),
        "_actual_price_qualifier": (pl.Utf8, ["", ""]),
        "_actual_property_sub_type": (pl.Utf8, ["Mid-Terrace", "End-Terrace"]),
        "_actual_lat": (pl.Float64, [51.51, 51.52]),
        "_actual_lon": (pl.Float64, [-0.11, -0.12]),
        "_actual_total_floor_area": (pl.Float64, [110.0, None]),
        "_actual_number_habitable_rooms": (pl.Int16, [4, 3]),
        "_actual_property_type": (pl.Utf8, ["Terraced", "Flats/Maisonettes"]),
        "_actual_leasehold_freehold": (pl.Utf8, ["Freehold", "Leasehold"]),
    }
    df = pl.DataFrame(
        {name: values for name, (_dtype, values) in columns.items()},
        schema={name: dtype for name, (dtype, _values) in columns.items()},
    )

    finalized = _finalize_listings(df).sort("Address per Property Register")

    assert finalized.height == 2
    expected_columns = {
        "Listing URL": ["url0", "url1"],
        "Asking price": [600_000, 700_000],
        "Asking price per sqm": [5_000, 7_368],
        "Est. price per sqm": [5_000, 7_368],
        "Estimated current price": [600_000, 700_000],
        "Last known price": [500_000, 700_000],
        # The listing's floor area / rooms / property type / tenure are preferred.
        "Total floor area (sqm)": [110.0, 95.0],
        "Number of bedrooms & living rooms": [4, 3],
        "Property type": ["Terraced", "Flats/Maisonettes"],
        "Leasehold/Freehold": ["Freehold", "Leasehold"],
        # Postcode-level feature is carried through to both rows.
        "Street tree density percentile": [42.0, 42.0],
        # Match status reflects whether historical context was available.
        "Historical property match status": ["matched", "unmatched"],
    }
    for column, expected_values in expected_columns.items():
        assert finalized[column].to_list() == expected_values

    # Overlay scaffolding is dropped.
    final_columns = finalized.columns
    for source_name, overlay_name, _dtype in _LISTING_OVERLAY_SOURCES:
        assert overlay_name not in final_columns, source_name