1089 lines
40 KiB
Python
1089 lines
40 KiB
Python
import polars as pl
|
|
import pyarrow as pa
|
|
import pytest
|
|
from shapely import box, to_wkb
|
|
from shapely.geometry import Point
|
|
|
|
from pipeline.transform.merge import (
|
|
_AREA_COLUMNS,
|
|
CONSERVATION_AREA_FEATURE,
|
|
LISTED_BUILDING_FEATURE,
|
|
TREE_DENSITY_FEATURE,
|
|
_LISTING_OVERLAY_SOURCES,
|
|
_active_english_postcode_area,
|
|
_build_unmatched_listing_seed_rows,
|
|
_canonical_postcode_expr,
|
|
_filter_to_active_english_postcodes,
|
|
_finalize_listings,
|
|
_integrate_listings,
|
|
_match_direct_epc,
|
|
_match_listing_properties,
|
|
_normalize_uprn,
|
|
_is_dynamic_poi_metric_column,
|
|
_less_deprived_percentile_expr,
|
|
_load_conservation_area_geometries,
|
|
_load_listings_for_merge,
|
|
_matched_listed_building_flags,
|
|
_postcode_conservation_area_flags,
|
|
_postcode_listed_building_candidates,
|
|
_remap_terminated_postcodes,
|
|
_split_normal_outputs,
|
|
_tree_density_by_postcode,
|
|
_validate_lad_source_coverage,
|
|
_validate_postcode_feature_output,
|
|
_validate_property_postcodes,
|
|
)
|
|
|
|
|
|
def test_less_deprived_percentile_expr_preserves_direction_and_nulls() -> None:
    """The lowest score maps to 100, the highest to 0, and a null stays null."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 2.0, 3.0, None]})

    percentile_expr = _less_deprived_percentile_expr(column)
    ranked = scores.lazy().with_columns(percentile_expr).collect()

    assert ranked[column].to_list() == [100.0, 50.0, 0.0, None]
|
|
|
|
|
|
def test_less_deprived_percentile_expr_uses_exact_scale_endpoints() -> None:
    """Tied extremes still land exactly on the 100 and 0 ends of the scale."""
    column = "Income Score (rate)"
    scores = pl.DataFrame({column: [1.0, 1.0, 2.0, 3.0, 3.0]})

    percentile_expr = _less_deprived_percentile_expr(column)
    ranked = scores.lazy().with_columns(percentile_expr).collect()

    assert ranked[column].to_list() == [100.0, 100.0, 50.0, 0.0, 0.0]
|
|
|
|
|
|
def test_dynamic_poi_metric_columns_are_area_level() -> None:
    """Per-amenity distance/count columns are detected; the restaurant count is not."""
    dynamic_columns = (
        "Distance to nearest amenity (Cafe) (km)",
        "Distance to nearest amenity (Park) (km)",
        "Number of amenities (Cafe) within 2km",
        "Number of amenities (Cafe) within 5km",
    )
    for column in dynamic_columns:
        assert _is_dynamic_poi_metric_column(column)

    assert not _is_dynamic_poi_metric_column("Number of restaurants within 2km")
|
|
|
|
|
|
def test_country_code_is_kept_in_postcode_area_columns() -> None:
    """The ``ctry25cd`` country code is one of the area-level columns."""
    country_code_column = "ctry25cd"
    assert country_code_column in _AREA_COLUMNS
|
|
|
|
|
|
def test_conservation_area_feature_is_area_level() -> None:
    """The conservation-area feature is listed among the area-level columns."""
    is_area_level = CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
    assert is_area_level
|
|
|
|
|
|
def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
    """Raw crime counts are area columns; the per-1k-residents variants are not."""
    # Crime is a raw spatial count per postcode now: the per-1k-residents
    # variants went away together with the LSOA population denominator.
    kept_columns = ("Serious crime (avg/yr)", "Minor crime (avg/yr)")
    dropped_columns = (
        "Serious crime per 1k residents (avg/yr)",
        "Minor crime per 1k residents (avg/yr)",
    )

    for column in kept_columns:
        assert column in _AREA_COLUMNS
    for column in dropped_columns:
        assert column not in _AREA_COLUMNS
|
|
|
|
|
|
def test_active_english_postcode_area_filters_to_active_england() -> None:
    """Only the English postcode without a termination date survives, renamed."""
    # Row 0: active English; row 1: terminated English; row 2: active Welsh.
    source = pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"],
            "ctry25cd": ["E92000001", "E92000001", "W92000004"],
            "doterm": [None, "2020-01-01", None],
            "lat": [51.0, 51.1, 52.0],
            "long": [-0.1, -0.2, -3.0],
            "lsoa21cd": ["L1", "L2", "L3"],
            "oa21cd": ["O1", "O2", "O3"],
            "pcon24cd": ["P1", "P2", "P3"],
        }
    )
    expected_row = {
        "postcode": "AA1 1AA",
        "lat": 51.0,
        "lon": -0.1,
        "ctry25cd": "E92000001",
        "lsoa21": "L1",
        "oa21": "O1",
        "pcon": "P1",
    }

    active_area = _active_english_postcode_area(source.lazy()).collect()

    assert active_area.to_dicts() == [expected_row]
|
|
|
|
|
|
def test_remap_then_active_filter_keeps_terminated_english_properties() -> None:
    """A row on a remapped terminated postcode passes the active-postcode filter."""
    properties = pl.DataFrame(
        {
            "postcode": ["OLD 1AA", "NEW 1AA", "CF1 1AA"],
            "row_id": [1, 2, 3],
        }
    ).lazy()
    terminated_to_new = pl.DataFrame(
        {"old_postcode": ["OLD 1AA"], "new_postcode": ["NEW 1AA"]}
    ).lazy()
    active = pl.DataFrame({"postcode": ["NEW 1AA"]}).lazy()

    remapped = _remap_terminated_postcodes(properties, terminated_to_new)
    kept = (
        _filter_to_active_english_postcodes(remapped, active)
        .collect()
        .sort("row_id")
    )

    # Row 1 was moved onto the active postcode; row 3 is not active and is dropped.
    expected = [
        {"postcode": "NEW 1AA", "row_id": 1},
        {"postcode": "NEW 1AA", "row_id": 2},
    ]
    assert kept.to_dicts() == expected
|
|
|
|
|
|
def test_split_normal_outputs_uses_postcode_feature_universe() -> None:
    """Postcode output covers every feature postcode; property output keeps property fields."""
    amenity_column = "Distance to nearest amenity (Park) (km)"
    merged = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Address per Property Register": ["1 Example Road"],
            "Last known price": [250_000],
            "lat": [51.0],
            "lon": [-0.1],
            "ctry25cd": ["E92000001"],
            "lsoa21": ["L1"],
        }
    )
    # The feature frame has a second postcode with no property rows at all.
    postcode_features = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            "lsoa21": ["L1", "L2"],
            amenity_column: [0.3, 0.8],
        }
    )

    postcode_out, properties_out = _split_normal_outputs(
        merged, postcode_features, expected_postcode_count=2
    )

    assert postcode_out["Postcode"].to_list() == ["AA1 1AA", "BB1 1BB"]
    assert amenity_column in postcode_out.columns
    expected_property = {
        "Postcode": "AA1 1AA",
        "Address per Property Register": "1 Example Road",
        "Last known price": 250_000,
    }
    assert properties_out.to_dicts() == [expected_property]
|
|
|
|
|
|
def test_postcode_feature_validation_rejects_unsupported_or_ungeocoded_rows() -> None:
    """Validation raises for a frame holding a Welsh row with null coordinates."""
    features = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "CF1 1AA"],
            "lat": [51.0, None],
            "lon": [-0.1, None],
            "ctry25cd": ["E92000001", "W92000004"],
        }
    )
    expected_message = "unsupported or ungeocoded"

    with pytest.raises(ValueError, match=expected_message):
        _validate_postcode_feature_output(features, expected_postcode_count=2)
|
|
|
|
|
|
def test_listed_building_feature_is_property_level() -> None:
    """The listed-building feature is not one of the area-level columns."""
    is_area_level = LISTED_BUILDING_FEATURE in _AREA_COLUMNS
    assert not is_area_level
|
|
|
|
|
|
def test_postcode_conservation_area_flags_marks_point_membership() -> None:
    """A point inside the area is "Yes"; outside or with a null latitude is "No"."""
    unit_square = box(0, 0, 1, 1)
    postcode_points = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB", "CC1 1CC"],
            "lat": [0.5, 2.0, None],
            "lon": [0.5, 2.0, 0.5],
        }
    )

    flags = _postcode_conservation_area_flags(
        postcode_points, [unit_square], "EPSG:4326", batch_size=2
    )
    flags = flags.sort("postcode")

    expected = [
        {"postcode": "AA1 1AA", CONSERVATION_AREA_FEATURE: "Yes"},
        {"postcode": "BB1 1BB", CONSERVATION_AREA_FEATURE: "No"},
        {"postcode": "CC1 1CC", CONSERVATION_AREA_FEATURE: "No"},
    ]
    assert flags.to_dicts() == expected
|
|
|
|
|
|
def test_load_conservation_area_geometries_uses_current_planning_data_records(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path,
) -> None:
    """Only the un-ended conservation-area polygon is returned by the loader."""
    geojson_path = tmp_path / "conservation_areas.geojson"
    real_area = box(0, 0, 1, 1)
    ended_area = box(2, 2, 3, 3)
    other_dataset_area = box(4, 4, 5, 5)
    point = Point(0.5, 0.5)

    def fake_read_arrow(path):
        # Stand-in for pyogrio.read_arrow: returns (metadata, arrow table).
        assert path == tmp_path / "conservation_areas.geojson"
        shapes = to_wkb([real_area, ended_area, other_dataset_area, point])
        columns = {
            "dataset": [
                "conservation-area",
                "conservation-area",
                "listed-building",
                "conservation-area",
            ],
            "end-date": ["", "2025-01-01", "", ""],
            "name": ["Central Village", "Old Boundary", "Other", "Point Record"],
            "SHAPE": shapes,
        }
        metadata = {"geometry_name": "SHAPE", "crs": "EPSG:4326"}
        return metadata, pa.table(columns)

    monkeypatch.setattr("pipeline.transform.merge.pyogrio.read_arrow", fake_read_arrow)

    geometries, crs = _load_conservation_area_geometries(geojson_path)

    assert crs == "EPSG:4326"
    # The ended record, the other-dataset record and the point record are all excluded.
    assert geometries == [real_area]
|
|
|
|
|
|
def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
    """Only the listed building close to a postcode yields a candidate row."""
    # Upper-case, strip punctuation and collapse whitespace to get the match name.
    match_name = (
        pl.col("Name")
        .str.to_uppercase()
        .str.replace_all(r"[^0-9A-Z]+", " ")
        .str.replace_all(r"\s+", " ")
        .str.strip_chars()
        .alias("_listed_match_name")
    )
    listed_points = pl.DataFrame(
        {
            "ListEntry": [1234, 5678],
            "Name": ["1 and 2 High Street", "Distant Hall"],
            "Grade": ["II", "I"],
            "Easting": [100.0, 1000.0],
            "Northing": [100.0, 1000.0],
        }
    ).with_columns(match_name)
    active_postcodes = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "east1m": [105.0, 5000.0],
            "north1m": [105.0, 5000.0],
        }
    )

    candidates = _postcode_listed_building_candidates(
        listed_points,
        active_postcodes,
        nearest_postcodes=1,
        max_distance_m=25,
    )

    matched_pairs = candidates.select("postcode", "_listed_match_name").to_dicts()
    assert matched_pairs == [
        {"postcode": "AA1 1AA", "_listed_match_name": "1 AND 2 HIGH STREET"}
    ]
|
|
|
|
|
|
def test_matched_listed_building_flags_requires_address_match() -> None:
    """Only properties whose address matches a listed candidate are flagged "Yes"."""
    properties = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "AA1 1AA", "BB1 1BB"],
            "pp_address": ["1 HIGH STREET", "99 HIGH STREET", "THE OLD RECTORY"],
            "epc_address": ["1, High Street", "99, High Street", "Old Rectory"],
        }
    )
    listed_candidates = pl.DataFrame(
        {
            "postcode": ["AA1 1AA", "BB1 1BB"],
            "_listed_match_name": ["1 AND 2 HIGH STREET", "OLD RECTORY"],
            "_listed_grade": ["II", "II*"],
            "_listed_entry": [1234, 5678],
        }
    )

    flags = _matched_listed_building_flags(
        properties.lazy(), listed_candidates, min_score=95
    )
    flags = flags.sort("postcode", "pp_address")

    # "99 HIGH STREET" shares the postcode but is absent from the flagged rows.
    expected = [
        {
            "postcode": "AA1 1AA",
            "pp_address": "1 HIGH STREET",
            LISTED_BUILDING_FEATURE: "Yes",
        },
        {
            "postcode": "BB1 1BB",
            "pp_address": "THE OLD RECTORY",
            LISTED_BUILDING_FEATURE: "Yes",
        },
    ]
    assert flags.to_dicts() == expected
|
|
|
|
|
|
def test_validate_property_postcodes_rejects_blank_rows() -> None:
    """A property row with an empty-string postcode makes validation raise."""
    properties = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", ""],
            "Address per Property Register": ["1 Example Street", "2 Example Street"],
            "Last known price": [100_000, 200_000],
        }
    )
    expected_message = "Property rows missing a postcode"

    with pytest.raises(ValueError, match=expected_message):
        _validate_property_postcodes(properties)
|
|
|
|
|
|
def test_validate_lad_source_coverage_allows_only_known_rent_no_data_lads(
    tmp_path,
) -> None:
    """Validation passes when rent data lacks only Isles of Scilly and City of London."""
    lad_codes = ["E08000016", "E06000053", "E09000001"]
    lad_names = ["Barnsley", "Isles of Scilly", "City of London"]

    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"

    iod = pl.DataFrame(
        {
            "Local Authority District code (2024)": lad_codes,
            "Local Authority District name (2024)": lad_names,
        }
    )
    iod.write_parquet(iod_path)

    ethnicity = pl.DataFrame({"Geography_code": lad_codes})
    ethnicity.write_parquet(ethnicity_path)

    # Rental data only covers Barnsley.
    rental = pl.DataFrame({"area_code": ["E08000016"], "bedrooms": [1]})
    rental.write_parquet(rental_path)

    # Must complete without raising.
    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
|
|
|
|
|
def test_validate_lad_source_coverage_rejects_unexpected_rent_holes(tmp_path) -> None:
    """Validation raises when rent data lacks a LAD present in the other sources."""
    iod_path = tmp_path / "iod.parquet"
    ethnicity_path = tmp_path / "ethnicity.parquet"
    rental_path = tmp_path / "rental.parquet"

    iod = pl.DataFrame(
        {
            "Local Authority District code (2024)": ["E08000016"],
            "Local Authority District name (2024)": ["Barnsley"],
        }
    )
    iod.write_parquet(iod_path)

    ethnicity = pl.DataFrame({"Geography_code": ["E08000016"]})
    ethnicity.write_parquet(ethnicity_path)

    # Rental data carries a different area code, so Barnsley has no rent rows.
    rental = pl.DataFrame({"area_code": ["E08000019"], "bedrooms": [1]})
    rental.write_parquet(rental_path)

    with pytest.raises(ValueError, match="Rental data is missing"):
        _validate_lad_source_coverage(iod_path, ethnicity_path, rental_path)
|
|
|
|
|
|
def test_tree_density_by_postcode_aliases_radius_percentile(tmp_path) -> None:
    """The 50m percentile column is exposed under TREE_DENSITY_FEATURE as Float32."""
    parquet_path = tmp_path / "tree_density_by_postcode.parquet"
    source = pl.DataFrame(
        {
            "postcode": ["AB1 2CD", "EF3 4GH"],
            "Tree canopy density percentile within 50m": [12.5, 99.0],
        }
    )
    source.write_parquet(parquet_path)

    density = _tree_density_by_postcode(parquet_path).collect().sort("postcode")

    assert density.columns == ["postcode", TREE_DENSITY_FEATURE]
    assert density[TREE_DENSITY_FEATURE].to_list() == [12.5, 99.0]
    assert density.schema[TREE_DENSITY_FEATURE] == pl.Float32
|
|
|
|
|
|
def test_tree_density_by_postcode_requires_postcode_and_density_columns(
    tmp_path,
) -> None:
    """Loading raises when the density column or the postcode column is absent."""
    # Case 1: postcode present, but no density percentile column.
    missing_density_path = tmp_path / "tree_density_by_postcode.parquet"
    missing_density = pl.DataFrame({"postcode": ["AB1 2CD"], "unrelated": [1.0]})
    missing_density.write_parquet(missing_density_path)

    with pytest.raises(ValueError, match="must contain column"):
        _tree_density_by_postcode(missing_density_path)

    # Case 2: density percentile present, but no postcode column.
    missing_postcode_path = tmp_path / "missing_postcode.parquet"
    missing_postcode = pl.DataFrame(
        {"Tree canopy density percentile within 50m": [12.5]}
    )
    missing_postcode.write_parquet(missing_postcode_path)

    with pytest.raises(ValueError, match="missing required column: postcode"):
        _tree_density_by_postcode(missing_postcode_path)
|
|
|
|
|
|
def _sample_listings_frame() -> pl.DataFrame:
    """Return a one-row listings frame with an explicit dtype for every column."""
    # (column name, dtype, single-row values) -- kept together so the data and
    # schema cannot drift apart.
    column_specs = [
        ("Bedrooms", pl.Int32, [3]),
        ("Bathrooms", pl.Int32, [2]),
        ("Number of bedrooms & living rooms", pl.Int32, [4]),
        ("lon", pl.Float64, [-0.1]),
        ("lat", pl.Float64, [51.5]),
        ("Postcode", pl.Utf8, ["sw1a1aa"]),
        ("Address per Property Register", pl.Utf8, ["1 Example Road"]),
        ("Leasehold/Freehold", pl.Utf8, ["Freehold"]),
        ("Property type", pl.Utf8, ["Terraced"]),
        ("Property sub-type", pl.Utf8, ["Mid-Terrace"]),
        ("Price qualifier", pl.Utf8, [""]),
        ("Total floor area (sqm)", pl.Float64, [120.0]),
        ("Listing URL", pl.Utf8, ["https://example.test/abc"]),
        ("Listing features", pl.List(pl.Utf8), [["Garden", "Off-street parking"]]),
        ("Listing date", pl.Datetime("us"), [None]),
        ("Listing status", pl.Utf8, ["For sale"]),
        ("Asking price", pl.Int64, [750_000]),
        ("Asking price per sqm", pl.Int32, [6_250]),
    ]
    values_by_column = {name: values for name, _dtype, values in column_specs}
    dtype_by_column = {name: dtype for name, dtype, _values in column_specs}
    return pl.DataFrame(values_by_column, schema=dtype_by_column)
|
|
|
|
|
|
def _stub_arcgis(path) -> None:
    """Write a one-row postcode lookup parquet file (SW1A 1AA) to ``path``."""
    # (column name, dtype, single-row values)
    column_specs = [
        ("pcds", pl.Utf8, ["SW1A 1AA"]),
        ("ctry25cd", pl.Utf8, ["E92000001"]),
        ("doterm", pl.Utf8, [None]),
        ("east1m", pl.Float64, [530000.0]),
        ("north1m", pl.Float64, [180000.0]),
    ]
    lookup = pl.DataFrame(
        {name: values for name, _dtype, values in column_specs},
        schema={name: dtype for name, dtype, _values in column_specs},
    )
    lookup.write_parquet(path)
|
|
|
|
|
|
def test_canonical_postcode_expr_formats_compact_postcodes() -> None:
    """Compact and spaced postcodes canonicalise alike; "bad" and null become null."""
    raw = pl.DataFrame({"Postcode": ["sw1a1aa", "SW1A 1AA", "bad", None]})

    canonical_expr = _canonical_postcode_expr("Postcode").alias("canonical")
    canonicalised = raw.with_columns(canonical_expr)

    expected = ["SW1A 1AA", "SW1A 1AA", None, None]
    assert canonicalised["canonical"].to_list() == expected
|
|
|
|
|
|
def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
    tmp_path,
) -> None:
    """Loaded listings have a canonical postcode plus ``_actual_*`` overlay columns."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    loaded = _load_listings_for_merge(listings_path, arcgis_path)

    expected_by_column = {
        "postcode": ["SW1A 1AA"],
        "pp_address": ["1 Example Road"],
        "_actual_listing_url": ["https://example.test/abc"],
        "_actual_asking_price": [750_000],
        "_actual_lat": [51.5],
    }
    for column, expected in expected_by_column.items():
        assert loaded[column].to_list() == expected
|
|
|
|
|
|
def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
    tmp_path,
) -> None:
    """A seed row takes its property-shaped fields from the unmatched listing."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    listings = _load_listings_for_merge(listings_path, arcgis_path)

    schema_fields = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "pp_property_type": pl.Utf8,
        "duration": pl.Utf8,
        "total_floor_area": pl.Float64,
        "number_habitable_rooms": pl.Int16,
        "latest_price": pl.Int64,
        "epc_address": pl.Utf8,
    }
    for _source, overlay_name, overlay_dtype in _LISTING_OVERLAY_SOURCES:
        schema_fields[overlay_name] = overlay_dtype
    template_schema = pl.Schema(schema_fields)

    # Treat every loaded listing as unmatched.
    unmatched_idxs = listings.select("_listing_idx")
    seed = _build_unmatched_listing_seed_rows(unmatched_idxs, listings, template_schema)

    assert seed.height == 1
    populated = {
        "postcode": ["SW1A 1AA"],
        "pp_address": ["1 Example Road"],
        "pp_property_type": ["Terraced"],
        "duration": ["Freehold"],
        "total_floor_area": [120.0],
        "number_habitable_rooms": [4],
        "latest_price": [750_000],
    }
    for column, expected in populated.items():
        assert seed[column].to_list() == expected
    # A column the listing does not populate comes through as null.
    assert seed["epc_address"].to_list() == [None]
    # Overlay columns are carried across unchanged.
    assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
|
|
|
|
|
|
def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
    tmp_path,
) -> None:
    """When the listing lacks area/rooms, the seed row uses the ``_direct_*`` values."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # Null out the listing's own floor area and room count.
    listing_without_size = _sample_listings_frame().with_columns(
        pl.lit(None, dtype=pl.Float64).alias("Total floor area (sqm)"),
        pl.lit(None, dtype=pl.Int32).alias("Number of bedrooms & living rooms"),
    )
    listing_without_size.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    direct_epc_columns = [
        pl.lit("1 Example Road").alias("_direct_epc_address"),
        pl.lit("C").alias("_direct_current_energy_rating"),
        pl.lit("B").alias("_direct_potential_energy_rating"),
        pl.lit(98.0).alias("_direct_total_floor_area"),
        pl.lit(4, dtype=pl.Int16).alias("_direct_number_habitable_rooms"),
        pl.lit(2.4).alias("_direct_floor_height"),
        pl.lit(1930, dtype=pl.UInt16).alias("_direct_construction_age_band"),
        pl.lit(1, dtype=pl.UInt8).alias("_direct_is_construction_date_approximate"),
        pl.lit("No").alias("_direct_was_council_house"),
    ]
    listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns(
        *direct_epc_columns
    )

    schema_fields = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "total_floor_area": pl.Float64,
        "number_habitable_rooms": pl.Int16,
        "epc_address": pl.Utf8,
        "current_energy_rating": pl.Utf8,
        "was_council_house": pl.Utf8,
    }
    for _source, overlay_name, overlay_dtype in _LISTING_OVERLAY_SOURCES:
        schema_fields[overlay_name] = overlay_dtype
    template_schema = pl.Schema(schema_fields)

    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )

    expected_by_column = {
        "total_floor_area": [98.0],
        "number_habitable_rooms": [4],
        "epc_address": ["1 Example Road"],
        "current_energy_rating": ["C"],
        "was_council_house": ["No"],
    }
    for column, expected in expected_by_column.items():
        assert seed[column].to_list() == expected
|
|
|
|
|
|
# Column dtypes for the EPC candidate frames built by the test helpers below.
_DIRECT_EPC_CANDIDATE_SCHEMA = {
    # Row id plus the keys used for matching.
    "_direct_epc_row": pl.UInt32,
    "_direct_epc_match_address": pl.Utf8,
    "_direct_epc_match_postcode": pl.Utf8,
    "_direct_epc_outcode": pl.Utf8,
    "_direct_epc_canonical_property_type": pl.Utf8,
    "_direct_epc_uprn": pl.Utf8,
    # Payload columns carried along with the candidate.
    "_direct_epc_address": pl.Utf8,
    "_direct_current_energy_rating": pl.Utf8,
    "_direct_potential_energy_rating": pl.Utf8,
    "_direct_total_floor_area": pl.Float64,
    "_direct_number_habitable_rooms": pl.Int16,
    "_direct_floor_height": pl.Float64,
    "_direct_construction_age_band": pl.UInt16,
    "_direct_is_construction_date_approximate": pl.UInt8,
    "_direct_was_council_house": pl.Utf8,
}
|
|
|
|
# Column dtypes for the listing-side frames built by ``_listing_matches``.
_LISTING_MATCH_SCHEMA = {
    "_listing_idx": pl.UInt32,
    "_listing_match_address": pl.Utf8,
    "_listing_match_postcode": pl.Utf8,
    "_listing_uprn": pl.Utf8,
}
|
|
|
|
|
|
def _direct_epc_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build an EPC candidate frame, one row per override dict in ``rows``.

    Each entry in ``rows`` overrides the default field values below; the frame
    is typed with ``_DIRECT_EPC_CANDIDATE_SCHEMA``.
    """
    defaults = {
        "_direct_epc_row": 0,
        "_direct_epc_match_address": "1 EXAMPLE ROAD",
        "_direct_epc_match_postcode": "AA11AA",
        "_direct_epc_outcode": "AA1",
        "_direct_epc_canonical_property_type": "Terraced",
        "_direct_epc_uprn": None,
        "_direct_epc_address": "1, Example Road",
        "_direct_current_energy_rating": "C",
        "_direct_potential_energy_rating": "B",
        "_direct_total_floor_area": 101.0,
        "_direct_number_habitable_rooms": 4,
        "_direct_floor_height": 2.5,
        "_direct_construction_age_band": 1930,
        "_direct_is_construction_date_approximate": 1,
        "_direct_was_council_house": "No",
    }
    records = []
    for overrides in rows:
        record = dict(defaults)
        record.update(overrides)
        records.append(record)
    return pl.DataFrame(records, schema=_DIRECT_EPC_CANDIDATE_SCHEMA)
|
|
|
|
|
|
def _listing_matches(rows: list[dict]) -> pl.DataFrame:
    """Build a listing match frame, one row per override dict in ``rows``.

    Each entry in ``rows`` overrides the default field values below; the frame
    is typed with ``_LISTING_MATCH_SCHEMA``.
    """
    defaults = {
        "_listing_idx": 0,
        "_listing_match_address": "1 EXAMPLE ROAD",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
    }
    records = []
    for overrides in rows:
        record = dict(defaults)
        record.update(overrides)
        records.append(record)
    return pl.DataFrame(records, schema=_LISTING_MATCH_SCHEMA)
|
|
|
|
|
|
def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None:
    """A shared UPRN matches listing and EPC even when their postcodes differ."""
    # UPRN matching is global rather than per postcode bucket, so a listing
    # whose detail-page postcode is slightly off still resolves to the right
    # EPC certificate through its UPRN.
    shared_uprn = "100000000001"
    listings = _listing_matches(
        [{"_listing_uprn": shared_uprn, "_listing_match_postcode": "ZZ99ZZ"}]
    )
    candidates = _direct_epc_candidates(
        [
            {
                "_direct_epc_uprn": shared_uprn,
                "_direct_epc_match_postcode": "AA11AA",
            }
        ]
    )

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["uprn"]
|
|
|
|
|
|
def test_match_direct_epc_matches_by_address_in_same_postcode() -> None:
    """With no UPRN, equal match addresses in one postcode match by "address"."""
    match_address = "1 EXAMPLE ROAD"
    listings = _listing_matches([{"_listing_match_address": match_address}])
    candidates = _direct_epc_candidates([{"_direct_epc_match_address": match_address}])

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
    assert matches["_direct_epc_match_method"].to_list() == ["address"]
|
|
|
|
|
|
def test_normalize_uprn_handles_types_and_floats() -> None:
    """UPRNs normalise to a digit string; unusable inputs normalise to None."""
    # Missing and blank values, plus non-integral / NaN floats, are rejected
    # rather than mangled.
    for rejected in (None, "", 1.5, float("nan")):
        assert _normalize_uprn(rejected) is None

    accepted = (
        (" 100012345678 ", "100012345678"),
        (100012345678, "100012345678"),
        # An integral float normalises to its digits, NOT "1230".
        (123.0, "123"),
    )
    for raw, expected in accepted:
        assert _normalize_uprn(raw) == expected
|
|
|
|
|
|
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build a property candidate frame, one row per override dict in ``rows``.

    Each entry in ``rows`` overrides the default field values below; every
    column is typed as ``pl.Utf8``.
    """
    defaults = {
        "postcode": "AA1 1AA",
        "pp_address": "1 Example Road",
        "_property_match_postcode": "AA11AA",
        "_property_match_address": "1 EXAMPLE ROAD",
        "_property_epc_match_address": "1 EXAMPLE ROAD",
        "uprn": None,
    }
    string_columns = (
        "postcode",
        "pp_address",
        "_property_match_postcode",
        "_property_match_address",
        "_property_epc_match_address",
        "uprn",
    )
    records = []
    for overrides in rows:
        record = dict(defaults)
        record.update(overrides)
        records.append(record)
    return pl.DataFrame(records, schema={name: pl.Utf8 for name in string_columns})
|
|
|
|
|
|
def test_match_listing_properties_uprn_wins_dedup_tie() -> None:
    """When two listings claim one property, the UPRN-matched listing is kept."""
    # One listing claims the property by UPRN, the other by exact address
    # (both score 100). The UPRN claim has to win despite its higher
    # _listing_idx, which would otherwise break the tie the wrong way.
    claim_by_uprn = {
        "_listing_idx": 5,
        "_listing_uprn": "100000000001",
        "_listing_match_address": "SOMETHING ELSE",
    }
    claim_by_address = {
        "_listing_idx": 1,
        "_listing_uprn": None,
        "_listing_match_address": "1 EXAMPLE ROAD",
    }
    listings = _listing_matches([claim_by_uprn, claim_by_address])
    properties = _property_candidates([{"uprn": "100000000001"}])

    matches = _match_listing_properties(listings, properties)

    assert matches.height == 1
    assert matches["_listing_idx"].to_list() == [5]
    assert matches["_property_match_method"].to_list() == ["uprn"]
|
|
|
|
|
|
def test_match_direct_epc_does_not_match_other_postcode_without_uprn() -> None:
    """An EPC in another postcode with no UPRN produces no match."""
    # Matching goes by postcode/UPRN/street and never by coordinate proximity,
    # so a same-street EPC in a different postcode without a shared UPRN is
    # skipped.
    listings = _listing_matches([{"_listing_match_postcode": "AA11AA"}])
    candidates = _direct_epc_candidates(
        [{"_direct_epc_match_postcode": "BB22BB", "_direct_epc_uprn": None}]
    )

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 0
|
|
|
|
|
|
def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None:
    """The listing overlay lands on the matching property row and no other."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # (column name, dtype, values) for two properties in the same postcode;
    # only the second shares the sample listing's address.
    wide_columns = [
        ("postcode", pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        ("pp_address", pl.Utf8, ["9 Other Road", "1 Example Road"]),
        ("pp_property_type", pl.Utf8, ["Detached", "Terraced"]),
        ("duration", pl.Utf8, ["Freehold", "Freehold"]),
        ("total_floor_area", pl.Float64, [80.0, 90.0]),
        ("number_habitable_rooms", pl.Int16, [3, 4]),
        ("latest_price", pl.Int64, [500_000, 600_000]),
        ("epc_address", pl.Utf8, [None, "1 Example Road"]),
        ("current_energy_rating", pl.Utf8, [None, "C"]),
        ("potential_energy_rating", pl.Utf8, [None, "B"]),
        ("floor_height", pl.Float64, [None, 2.4]),
        ("construction_age_band", pl.UInt16, [None, 1930]),
        ("is_construction_date_approximate", pl.UInt8, [None, 1]),
        ("was_council_house", pl.Utf8, [None, "No"]),
    ]
    wide = pl.DataFrame(
        {name: values for name, _dtype, values in wide_columns},
        schema={name: dtype for name, dtype, _values in wide_columns},
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    matched_row = integrated.filter(pl.col("pp_address") == "1 Example Road")
    other_row = integrated.filter(pl.col("pp_address") == "9 Other Road")
    assert matched_row["_actual_listing_url"].to_list() == ["https://example.test/abc"]
    assert other_row["_actual_listing_url"].to_list() == [None]
|
|
|
|
|
|
def test_integrate_listings_matches_by_uprn_over_address(tmp_path) -> None:
    """A shared UPRN attaches the listing even though the addresses disagree."""
    # The listing's address is deliberately different from the property's; the
    # shared UPRN still produces an exact match (UPRN beats fuzzy street).
    shared_uprn = "100000000009"
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    listing = _sample_listings_frame().with_columns(
        pl.lit("Totally Different Road").alias("Address per Property Register"),
        pl.lit("100000000009").alias("UPRN"),
    )
    listing.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # (column name, dtype, values) for the single property row.
    wide_columns = [
        ("postcode", pl.Utf8, ["SW1A 1AA"]),
        ("pp_address", pl.Utf8, ["1 Example Road"]),
        ("uprn", pl.Utf8, [shared_uprn]),
        ("pp_property_type", pl.Utf8, ["Terraced"]),
        ("duration", pl.Utf8, ["Freehold"]),
        ("total_floor_area", pl.Float64, [90.0]),
        ("number_habitable_rooms", pl.Int16, [4]),
        ("latest_price", pl.Int64, [600_000]),
        ("epc_address", pl.Utf8, ["1 Example Road"]),
        ("current_energy_rating", pl.Utf8, ["C"]),
        ("potential_energy_rating", pl.Utf8, ["B"]),
        ("floor_height", pl.Float64, [2.4]),
        ("construction_age_band", pl.UInt16, [1930]),
        ("is_construction_date_approximate", pl.UInt8, [1]),
        ("was_council_house", pl.Utf8, ["No"]),
    ]
    wide = pl.DataFrame(
        {name: values for name, _dtype, values in wide_columns},
        schema={name: dtype for name, dtype, _values in wide_columns},
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    matched_row = integrated.filter(pl.col("pp_address") == "1 Example Road")
    # The overlay is attached to the property row matched by UPRN.
    assert matched_row["_actual_listing_url"].to_list() == ["https://example.test/abc"]
    # The listing's own (non-matching) address must not appear as a seed row.
    assert "Totally Different Road" not in integrated["pp_address"].to_list()
|
|
|
|
|
|
def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> None:
    """A listing on a different street becomes a seed row, not a forced match."""
    # A number-less listing on a street other than the property's (and with no
    # shared UPRN) must not be force-matched onto it; it gets its own seed row
    # rather than stamping its overlay on the wrong property.
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    listing = _sample_listings_frame().with_columns(
        pl.lit("Juniper Crescent").alias("Address per Property Register"),
    )
    listing.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # (column name, dtype, values) for the single existing property row.
    wide_columns = [
        ("postcode", pl.Utf8, ["SW1A 1AA"]),
        ("pp_address", pl.Utf8, ["Old Cottage High Street"]),
        ("pp_property_type", pl.Utf8, ["Terraced"]),
        ("duration", pl.Utf8, ["Freehold"]),
        ("total_floor_area", pl.Float64, [120.0]),
        ("number_habitable_rooms", pl.Int16, [4]),
        ("latest_price", pl.Int64, [750_000]),
        ("epc_address", pl.Utf8, ["Old Cottage High Street"]),
        ("current_energy_rating", pl.Utf8, ["C"]),
        ("potential_energy_rating", pl.Utf8, ["B"]),
        ("floor_height", pl.Float64, [2.4]),
        ("construction_age_band", pl.UInt16, [1930]),
        ("is_construction_date_approximate", pl.UInt8, [1]),
        ("was_council_house", pl.Utf8, ["No"]),
    ]
    wide = pl.DataFrame(
        {name: values for name, _dtype, values in wide_columns},
        schema={name: dtype for name, dtype, _values in wide_columns},
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    existing_row = integrated.filter(pl.col("pp_address") == "Old Cottage High Street")
    seed_row = integrated.filter(pl.col("pp_address") == "Juniper Crescent")
    assert existing_row["_actual_listing_url"].to_list() == [None]
    assert seed_row["_actual_listing_url"].to_list() == ["https://example.test/abc"]
|
|
|
|
|
|
def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
    None
):
    # Two listing rows in one postcode, described as (name, dtype, values):
    #   row 0 - a listing matched to a property with historical context;
    #   row 1 - a listing with no historical context (history columns are null).
    input_columns = [
        # Historical / base property columns.
        ("Postcode", pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        (
            "Address per Property Register",
            pl.Utf8,
            ["1 Example Road", "2 Example Road"],
        ),
        ("Address per EPC", pl.Utf8, ["1 Example Road", None]),
        ("Date of last transaction", pl.Float64, [1990.0, None]),
        ("lat", pl.Float64, [51.5, 51.5]),
        ("lon", pl.Float64, [-0.1, -0.1]),
        ("Total floor area (sqm)", pl.Float64, [100.0, 95.0]),
        ("Number of bedrooms & living rooms", pl.Int16, [3, None]),
        ("Property type", pl.Utf8, ["Terraced", None]),
        ("Leasehold/Freehold", pl.Utf8, ["Leasehold", None]),
        ("Last known price", pl.Int64, [500_000, None]),
        ("Street tree density percentile", pl.Float32, [42.0, 42.0]),
        # Listing overlay columns (both rows carry a listing URL).
        ("_actual_listing_url", pl.Utf8, ["url0", "url1"]),
        ("_actual_asking_price", pl.Int64, [600_000, 700_000]),
        ("_actual_asking_price_per_sqm", pl.Int32, [5_000, None]),
        ("_actual_listing_date", pl.Datetime("us"), [None, None]),
        ("_actual_listing_status", pl.Utf8, ["For sale", "For sale"]),
        ("_actual_listing_features", pl.List(pl.Utf8), [["Garden"], ["Parking"]]),
        ("_actual_bedrooms", pl.Int32, [3, 4]),
        ("_actual_bathrooms", pl.Int32, [1, 2]),
        ("_actual_price_qualifier", pl.Utf8, ["", ""]),
        ("_actual_property_sub_type", pl.Utf8, ["Mid-Terrace", "End-Terrace"]),
        ("_actual_lat", pl.Float64, [51.51, 51.52]),
        ("_actual_lon", pl.Float64, [-0.11, -0.12]),
        ("_actual_total_floor_area", pl.Float64, [110.0, None]),
        ("_actual_number_habitable_rooms", pl.Int16, [4, 3]),
        ("_actual_property_type", pl.Utf8, ["Terraced", "Flats/Maisonettes"]),
        ("_actual_leasehold_freehold", pl.Utf8, ["Freehold", "Leasehold"]),
    ]
    df = pl.DataFrame(
        {name: values for name, _dtype, values in input_columns},
        schema={name: dtype for name, dtype, _values in input_columns},
    )

    # Sort so the expectations below line up with row 0 / row 1 of the fixture.
    finalized = _finalize_listings(df).sort("Address per Property Register")

    assert finalized.height == 2

    # Expected output, checked column by column in this order.
    expected_columns = {
        "Listing URL": ["url0", "url1"],
        "Asking price": [600_000, 700_000],
        # Row 1 has no overlay per-sqm figure; 7_368 is consistent with
        # 700_000 / 95 sqm.
        "Asking price per sqm": [5_000, 7_368],
        "Est. price per sqm": [5_000, 7_368],
        "Estimated current price": [600_000, 700_000],
        # Row 1 has no historical price, so the asking price is expected here.
        "Last known price": [500_000, 700_000],
        # Listing's preferred floor area / rooms / property type / tenure; the
        # base floor area (95.0) survives where the overlay value is null.
        "Total floor area (sqm)": [110.0, 95.0],
        "Number of bedrooms & living rooms": [4, 3],
        "Property type": ["Terraced", "Flats/Maisonettes"],
        "Leasehold/Freehold": ["Freehold", "Leasehold"],
        # Postcode-level feature carried through to both matched and unmatched
        # rows.
        "Street tree density percentile": [42.0, 42.0],
        # Match status reflects historical context availability.
        "Historical property match status": ["matched", "unmatched"],
    }
    for column, values in expected_columns.items():
        assert finalized[column].to_list() == values, column

    # Overlay scaffolding is dropped.
    output_columns = finalized.columns
    for source_name, overlay_name, _dtype in _LISTING_OVERLAY_SOURCES:
        assert overlay_name not in output_columns, source_name
|
|
|
|
|
|
def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
    # The terminated-postcode remap can collapse two distinct wide rows onto the
    # same (postcode, pp_address), which attaches a single matched listing to
    # both of them. Finalize must emit one row per listing URL, not one per
    # collapsed wide row.
    #
    # Fixture columns as (name, dtype, values): the two rows share postcode and
    # address and differ only in transaction date, floor area and last price.
    fanned_out_columns = [
        ("Postcode", pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        (
            "Address per Property Register",
            pl.Utf8,
            ["1 Example Road", "1 Example Road"],
        ),
        ("Address per EPC", pl.Utf8, ["1 Example Road", "1 Example Road"]),
        ("Date of last transaction", pl.Float64, [1990.0, 1995.0]),
        ("lat", pl.Float64, [51.5, 51.5]),
        ("lon", pl.Float64, [-0.1, -0.1]),
        ("Total floor area (sqm)", pl.Float64, [100.0, 95.0]),
        ("Number of bedrooms & living rooms", pl.Int16, [3, 3]),
        ("Property type", pl.Utf8, ["Terraced", "Terraced"]),
        ("Leasehold/Freehold", pl.Utf8, ["Leasehold", "Leasehold"]),
        ("Last known price", pl.Int64, [500_000, 480_000]),
        ("Street tree density percentile", pl.Float32, [42.0, 42.0]),
        # Same listing URL on both collapsed rows - the fan-out to fix.
        ("_actual_listing_url", pl.Utf8, ["url0", "url0"]),
        ("_actual_asking_price", pl.Int64, [600_000, 600_000]),
        ("_actual_asking_price_per_sqm", pl.Int32, [5_000, 5_000]),
        ("_actual_listing_date", pl.Datetime("us"), [None, None]),
        ("_actual_listing_status", pl.Utf8, ["For sale", "For sale"]),
        ("_actual_listing_features", pl.List(pl.Utf8), [["Garden"], ["Garden"]]),
        ("_actual_bedrooms", pl.Int32, [3, 3]),
        ("_actual_bathrooms", pl.Int32, [1, 1]),
        ("_actual_price_qualifier", pl.Utf8, ["", ""]),
        ("_actual_property_sub_type", pl.Utf8, ["Mid-Terrace", "Mid-Terrace"]),
        ("_actual_lat", pl.Float64, [51.51, 51.51]),
        ("_actual_lon", pl.Float64, [-0.11, -0.11]),
        ("_actual_total_floor_area", pl.Float64, [110.0, 110.0]),
        ("_actual_number_habitable_rooms", pl.Int16, [4, 4]),
        ("_actual_property_type", pl.Utf8, ["Terraced", "Terraced"]),
        ("_actual_leasehold_freehold", pl.Utf8, ["Freehold", "Freehold"]),
    ]
    df = pl.DataFrame(
        {name: values for name, _dtype, values in fanned_out_columns},
        schema={name: dtype for name, dtype, _values in fanned_out_columns},
    )

    finalized = _finalize_listings(df)

    # Exactly one output row survives for the single listing URL.
    assert finalized.height == 1
    assert finalized["Listing URL"].to_list() == ["url0"]
|