scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

View file

@ -15,6 +15,8 @@ from pipeline.transform.merge import (
_finalize_listings,
_integrate_listings,
_match_direct_epc,
_match_listing_properties,
_normalize_uprn,
_is_dynamic_poi_metric_column,
_less_deprived_percentile_expr,
_load_conservation_area_geometries,
@ -68,6 +70,15 @@ def test_conservation_area_feature_is_area_level() -> None:
assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
    """Crime features are exposed as raw counts, not per-capita rates."""
    # Crime is a raw spatial count per postcode now; the per-1k-residents
    # variants went away together with the LSOA population denominator.
    kept_columns = ("Serious crime (avg/yr)", "Minor crime (avg/yr)")
    dropped_columns = (
        "Serious crime per 1k residents (avg/yr)",
        "Minor crime per 1k residents (avg/yr)",
    )
    for column in kept_columns:
        assert column in _AREA_COLUMNS
    for column in dropped_columns:
        assert column not in _AREA_COLUMNS
def test_listed_building_feature_is_property_level() -> None:
assert LISTED_BUILDING_FEATURE not in _AREA_COLUMNS
@ -471,71 +482,166 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
assert seed["was_council_house"].to_list() == ["No"]
def test_match_direct_epc_considers_nearby_postcodes() -> None:
listing_matches = pl.DataFrame(
{
"_listing_idx": [0],
"_listing_match_address": ["1 EXAMPLE ROAD"],
"_listing_match_postcode": ["AA11AA"],
"_listing_east": [1000.0],
"_listing_north": [1000.0],
"_actual_property_type": ["Terraced"],
"_actual_total_floor_area": [100.0],
"_actual_number_habitable_rooms": [4],
},
schema={
"_listing_idx": pl.UInt32,
"_listing_match_address": pl.Utf8,
"_listing_match_postcode": pl.Utf8,
"_listing_east": pl.Float64,
"_listing_north": pl.Float64,
"_actual_property_type": pl.Utf8,
"_actual_total_floor_area": pl.Float64,
"_actual_number_habitable_rooms": pl.Int16,
},
)
epc_candidates = pl.DataFrame(
{
"_direct_epc_row": [0],
"_direct_epc_match_address": ["1 EXAMPLE ROAD"],
"_direct_epc_match_postcode": ["BB11BB"],
"_direct_epc_east": [1020.0],
"_direct_epc_north": [1010.0],
"_direct_epc_canonical_property_type": ["Terraced"],
"_direct_epc_address": ["1, Example Road"],
"_direct_current_energy_rating": ["C"],
"_direct_potential_energy_rating": ["B"],
"_direct_total_floor_area": [101.0],
"_direct_number_habitable_rooms": [4],
"_direct_floor_height": [2.5],
"_direct_construction_age_band": [1930],
"_direct_is_construction_date_approximate": [1],
"_direct_was_council_house": ["No"],
},
schema={
"_direct_epc_row": pl.UInt32,
"_direct_epc_match_address": pl.Utf8,
"_direct_epc_match_postcode": pl.Utf8,
"_direct_epc_east": pl.Float64,
"_direct_epc_north": pl.Float64,
"_direct_epc_canonical_property_type": pl.Utf8,
"_direct_epc_address": pl.Utf8,
"_direct_current_energy_rating": pl.Utf8,
"_direct_potential_energy_rating": pl.Utf8,
"_direct_total_floor_area": pl.Float64,
"_direct_number_habitable_rooms": pl.Int16,
"_direct_floor_height": pl.Float64,
"_direct_construction_age_band": pl.UInt16,
"_direct_is_construction_date_approximate": pl.UInt8,
"_direct_was_council_house": pl.Utf8,
},
# Column name -> polars dtype for the EPC-candidate fixture frames that
# `_direct_epc_candidates` builds and the tests hand to `_match_direct_epc`.
_DIRECT_EPC_CANDIDATE_SCHEMA = {
"_direct_epc_row": pl.UInt32,
"_direct_epc_match_address": pl.Utf8,
"_direct_epc_match_postcode": pl.Utf8,
"_direct_epc_outcode": pl.Utf8,
"_direct_epc_canonical_property_type": pl.Utf8,
"_direct_epc_uprn": pl.Utf8,
"_direct_epc_address": pl.Utf8,
"_direct_current_energy_rating": pl.Utf8,
"_direct_potential_energy_rating": pl.Utf8,
"_direct_total_floor_area": pl.Float64,
"_direct_number_habitable_rooms": pl.Int16,
"_direct_floor_height": pl.Float64,
"_direct_construction_age_band": pl.UInt16,
"_direct_is_construction_date_approximate": pl.UInt8,
"_direct_was_council_house": pl.Utf8,
}
# Column name -> polars dtype for the listing-side fixture frames that
# `_listing_matches` builds (listing index, match keys and optional UPRN).
_LISTING_MATCH_SCHEMA = {
"_listing_idx": pl.UInt32,
"_listing_match_address": pl.Utf8,
"_listing_match_postcode": pl.Utf8,
"_listing_uprn": pl.Utf8,
}
def _direct_epc_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build an EPC-candidate fixture frame, one record per entry of *rows*.

    Each entry only needs the columns a test cares about; every other column
    is filled from the defaults below. The frame is typed with
    ``_DIRECT_EPC_CANDIDATE_SCHEMA``.
    """
    defaults = {
        "_direct_epc_row": 0,
        "_direct_epc_match_address": "1 EXAMPLE ROAD",
        "_direct_epc_match_postcode": "AA11AA",
        "_direct_epc_outcode": "AA1",
        "_direct_epc_canonical_property_type": "Terraced",
        "_direct_epc_uprn": None,
        "_direct_epc_address": "1, Example Road",
        "_direct_current_energy_rating": "C",
        "_direct_potential_energy_rating": "B",
        "_direct_total_floor_area": 101.0,
        "_direct_number_habitable_rooms": 4,
        "_direct_floor_height": 2.5,
        "_direct_construction_age_band": 1930,
        "_direct_is_construction_date_approximate": 1,
        "_direct_was_council_house": "No",
    }
    records = []
    for overrides in rows:
        # Start from the defaults and let the caller's values win.
        record = dict(defaults)
        record.update(overrides)
        records.append(record)
    return pl.DataFrame(records, schema=_DIRECT_EPC_CANDIDATE_SCHEMA)
matches = _match_direct_epc(listing_matches, epc_candidates)
def _listing_matches(rows: list[dict]) -> pl.DataFrame:
    """Build a listing-side fixture frame, one record per entry of *rows*.

    Columns missing from an entry fall back to the defaults below; the frame
    is typed with ``_LISTING_MATCH_SCHEMA``.
    """
    defaults = {
        "_listing_idx": 0,
        "_listing_match_address": "1 EXAMPLE ROAD",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
    }
    records = []
    for overrides in rows:
        # Caller-supplied values take precedence over the defaults.
        record = dict(defaults)
        record.update(overrides)
        records.append(record)
    return pl.DataFrame(records, schema=_LISTING_MATCH_SCHEMA)
def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None:
    """A shared UPRN links listing and EPC even when their postcodes differ."""
    # UPRN is matched globally (not within a postcode bucket), so a listing
    # whose detail-page postcode is slightly off still resolves to the right
    # EPC certificate by its UPRN.
    listings = _listing_matches(
        [{"_listing_uprn": "100000000001", "_listing_match_postcode": "ZZ99ZZ"}]
    )
    candidates = _direct_epc_candidates(
        [{"_direct_epc_uprn": "100000000001", "_direct_epc_match_postcode": "AA11AA"}]
    )

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    expected_by_column = {
        "_listing_idx": [0],
        "_direct_epc_address": ["1, Example Road"],
        "_direct_epc_match_method": ["uprn"],
    }
    for column, expected in expected_by_column.items():
        assert matches[column].to_list() == expected
def test_match_direct_epc_matches_by_address_in_same_postcode() -> None:
    """Without a UPRN, an identical address in the same postcode matches."""
    # Both fixture builders default to postcode "AA11AA" and a null UPRN, so
    # only the address can link the two rows.
    listings = _listing_matches([{"_listing_match_address": "1 EXAMPLE ROAD"}])
    candidates = _direct_epc_candidates(
        [{"_direct_epc_match_address": "1 EXAMPLE ROAD"}]
    )

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    matched_address = matches["_direct_epc_address"].to_list()
    match_method = matches["_direct_epc_match_method"].to_list()
    assert matched_address == ["1, Example Road"]
    assert match_method == ["address"]
def test_normalize_uprn_handles_types_and_floats() -> None:
    """`_normalize_uprn` yields a digit string, or None for unusable input."""
    cases = [
        (None, None),
        ("", None),
        (" 100012345678 ", "100012345678"),
        (100012345678, "100012345678"),
        # An integral float normalises to its digits, NOT "1230".
        (123.0, "123"),
        # Non-integral / NaN floats are rejected rather than mangled.
        (1.5, None),
        (float("nan"), None),
    ]
    for raw, expected in cases:
        normalized = _normalize_uprn(raw)
        if expected is None:
            assert normalized is None
        else:
            assert normalized == expected
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build a property-side fixture frame, one record per entry of *rows*.

    Columns missing from an entry fall back to the defaults below; every
    column is typed as a string (``pl.Utf8``).
    """
    defaults = {
        "postcode": "AA1 1AA",
        "pp_address": "1 Example Road",
        "_property_match_postcode": "AA11AA",
        "_property_match_address": "1 EXAMPLE ROAD",
        "_property_epc_match_address": "1 EXAMPLE ROAD",
        "uprn": None,
    }
    string_schema = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "_property_match_postcode": pl.Utf8,
        "_property_match_address": pl.Utf8,
        "_property_epc_match_address": pl.Utf8,
        "uprn": pl.Utf8,
    }
    records = []
    for overrides in rows:
        # Caller-supplied values take precedence over the defaults.
        record = dict(defaults)
        record.update(overrides)
        records.append(record)
    return pl.DataFrame(records, schema=string_schema)
def test_match_listing_properties_uprn_wins_dedup_tie() -> None:
    """When two listings claim one property, the UPRN match is the one kept."""
    # Two listings claim the same property: one by UPRN, one by exact address
    # (both score 100). The UPRN match must win even though it has the higher
    # _listing_idx (which would otherwise break the tie the wrong way).
    uprn_listing = {
        "_listing_idx": 5,
        "_listing_uprn": "100000000001",
        "_listing_match_address": "SOMETHING ELSE",
    }
    address_listing = {
        "_listing_idx": 1,
        "_listing_uprn": None,
        "_listing_match_address": "1 EXAMPLE ROAD",
    }
    listings = _listing_matches([uprn_listing, address_listing])
    properties = _property_candidates([{"uprn": "100000000001"}])

    matches = _match_listing_properties(listings, properties)

    assert matches.height == 1
    winning_idx = matches["_listing_idx"].to_list()
    winning_method = matches["_property_match_method"].to_list()
    assert winning_idx == [5]
    assert winning_method == ["uprn"]
def test_match_direct_epc_does_not_match_other_postcode_without_uprn() -> None:
    """An EPC in another postcode with no shared UPRN yields no match."""
    # Matching is by postcode/UPRN/street — never by coordinate proximity — so a
    # same-street EPC in a different postcode with no shared UPRN is skipped.
    listings = _listing_matches([{"_listing_match_postcode": "AA11AA"}])
    other_postcode_epc = {
        "_direct_epc_match_postcode": "BB22BB",
        "_direct_epc_uprn": None,
    }
    candidates = _direct_epc_candidates([other_postcode_epc])

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 0
def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None:
@ -588,11 +694,72 @@ def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -
assert other["_actual_listing_url"].to_list() == [None]
def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) -> None:
def test_integrate_listings_matches_by_uprn_over_address(tmp_path) -> None:
    """A shared UPRN attaches the listing overlay even when addresses differ.

    The listing is written with a register address that does not match the
    property's, plus the property's UPRN. Integration must attach the listing
    to the existing property row and must not seed a separate row for the
    listing's own address.

    Args:
        tmp_path: pytest-provided temporary directory for the parquet inputs.
    """
    # The listing's address deliberately does not match the property's, but the
    # shared UPRN drives an exact match anyway (UPRN beats fuzzy street).
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    # Alias the register address exactly once: a second expression targeting
    # the same output column in one `with_columns` call is a duplicate-name
    # error, and the value must be the non-matching address asserted below.
    _sample_listings_frame().with_columns(
        pl.lit("Totally Different Road").alias("Address per Property Register"),
        pl.lit("100000000009").alias("UPRN"),
    ).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    wide = pl.DataFrame(
        {
            "postcode": ["SW1A 1AA"],
            "pp_address": ["1 Example Road"],
            "uprn": ["100000000009"],
            "pp_property_type": ["Terraced"],
            "duration": ["Freehold"],
            "total_floor_area": [90.0],
            "number_habitable_rooms": [4],
            "latest_price": [600_000],
            "epc_address": ["1 Example Road"],
            "current_energy_rating": ["C"],
            "potential_energy_rating": ["B"],
            "floor_height": [2.4],
            "construction_age_band": [1930],
            "is_construction_date_approximate": [1],
            "was_council_house": ["No"],
        },
        schema={
            "postcode": pl.Utf8,
            "pp_address": pl.Utf8,
            "uprn": pl.Utf8,
            "pp_property_type": pl.Utf8,
            "duration": pl.Utf8,
            "total_floor_area": pl.Float64,
            "number_habitable_rooms": pl.Int16,
            "latest_price": pl.Int64,
            "epc_address": pl.Utf8,
            "current_energy_rating": pl.Utf8,
            "potential_energy_rating": pl.Utf8,
            "floor_height": pl.Float64,
            "construction_age_band": pl.UInt16,
            "is_construction_date_approximate": pl.UInt8,
            "was_council_house": pl.Utf8,
        },
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    matched = integrated.filter(pl.col("pp_address") == "1 Example Road")
    # The listing overlay attached to the UPRN-matched property row.
    assert matched["_actual_listing_url"].to_list() == ["https://example.test/abc"]
    # No spurious seed row for the listing's (non-matching) address.
    assert "Totally Different Road" not in integrated["pp_address"].to_list()
def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> None:
# A number-less listing whose street is not the property's street (and which
# shares no UPRN) must not be force-matched onto it; it becomes its own seed
# row instead of stamping the wrong property's overlay.
listings_path = tmp_path / "listings.parquet"
arcgis_path = tmp_path / "arcgis.parquet"
_sample_listings_frame().with_columns(
pl.lit("Juniper Crescent").alias("Address per Property Register"),
).write_parquet(listings_path)
_stub_arcgis(arcgis_path)
wide = pl.DataFrame(
@ -635,7 +802,7 @@ def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) ->
).collect()
existing = integrated.filter(pl.col("pp_address") == "Old Cottage High Street")
seed = integrated.filter(pl.col("pp_address") == "Rose Cottage High Street")
seed = integrated.filter(pl.col("pp_address") == "Juniper Crescent")
assert existing["_actual_listing_url"].to_list() == [None]
assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
@ -731,3 +898,77 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
# Overlay scaffolding is dropped.
for src, dst, _dt in _LISTING_OVERLAY_SOURCES:
assert dst not in finalized.columns, src
def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
    """Finalize emits one row per listing URL, not one per collapsed wide row."""
    # The terminated-postcode remap can collapse two distinct wide rows onto the same
    # (postcode, pp_address), so a single matched listing attaches to both. Finalize
    # must emit one row per listing URL, not one per collapsed wide row.
    #
    # Each spec is (column name, dtype, values for the two collapsed rows).
    column_specs = [
        ("Postcode", pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        ("Address per Property Register", pl.Utf8, ["1 Example Road", "1 Example Road"]),
        ("Address per EPC", pl.Utf8, ["1 Example Road", "1 Example Road"]),
        ("Date of last transaction", pl.Float64, [1990.0, 1995.0]),
        ("lat", pl.Float64, [51.5, 51.5]),
        ("lon", pl.Float64, [-0.1, -0.1]),
        ("Total floor area (sqm)", pl.Float64, [100.0, 95.0]),
        ("Number of bedrooms & living rooms", pl.Int16, [3, 3]),
        ("Property type", pl.Utf8, ["Terraced", "Terraced"]),
        ("Leasehold/Freehold", pl.Utf8, ["Leasehold", "Leasehold"]),
        ("Last known price", pl.Int64, [500_000, 480_000]),
        ("Street tree density percentile", pl.Float32, [42.0, 42.0]),
        # Same listing URL on both collapsed rows — the fan-out to fix.
        ("_actual_listing_url", pl.Utf8, ["url0", "url0"]),
        ("_actual_asking_price", pl.Int64, [600_000, 600_000]),
        ("_actual_asking_price_per_sqm", pl.Int32, [5_000, 5_000]),
        ("_actual_listing_date", pl.Datetime("us"), [None, None]),
        ("_actual_listing_status", pl.Utf8, ["For sale", "For sale"]),
        ("_actual_listing_features", pl.List(pl.Utf8), [["Garden"], ["Garden"]]),
        ("_actual_bedrooms", pl.Int32, [3, 3]),
        ("_actual_bathrooms", pl.Int32, [1, 1]),
        ("_actual_price_qualifier", pl.Utf8, ["", ""]),
        ("_actual_property_sub_type", pl.Utf8, ["Mid-Terrace", "Mid-Terrace"]),
        ("_actual_lat", pl.Float64, [51.51, 51.51]),
        ("_actual_lon", pl.Float64, [-0.11, -0.11]),
        ("_actual_total_floor_area", pl.Float64, [110.0, 110.0]),
        ("_actual_number_habitable_rooms", pl.Int16, [4, 4]),
        ("_actual_property_type", pl.Utf8, ["Terraced", "Terraced"]),
        ("_actual_leasehold_freehold", pl.Utf8, ["Freehold", "Freehold"]),
    ]
    collapsed_rows = pl.DataFrame(
        {name: values for name, _dtype, values in column_specs},
        schema={name: dtype for name, dtype, _values in column_specs},
    )

    finalized = _finalize_listings(collapsed_rows)

    assert finalized.height == 1
    assert finalized["Listing URL"].to_list() == ["url0"]