scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
|
|
@ -15,6 +15,8 @@ from pipeline.transform.merge import (
|
|||
_finalize_listings,
|
||||
_integrate_listings,
|
||||
_match_direct_epc,
|
||||
_match_listing_properties,
|
||||
_normalize_uprn,
|
||||
_is_dynamic_poi_metric_column,
|
||||
_less_deprived_percentile_expr,
|
||||
_load_conservation_area_geometries,
|
||||
|
|
@ -68,6 +70,15 @@ def test_conservation_area_feature_is_area_level() -> None:
|
|||
assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
|
||||
|
||||
|
||||
def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
    """Area columns carry raw crime counts and no per-1k-residents variants."""
    # Crime is now a raw spatial count per postcode; the per-1k-residents
    # variants were dropped along with the LSOA population denominator.
    retained = ("Serious crime (avg/yr)", "Minor crime (avg/yr)")
    dropped = (
        "Serious crime per 1k residents (avg/yr)",
        "Minor crime per 1k residents (avg/yr)",
    )

    for column in retained:
        assert column in _AREA_COLUMNS
    for column in dropped:
        assert column not in _AREA_COLUMNS
|
||||
|
||||
|
||||
def test_listed_building_feature_is_property_level() -> None:
    """The listed-building feature must not be among the area-level columns."""
    is_area_level = LISTED_BUILDING_FEATURE in _AREA_COLUMNS
    assert not is_area_level
|
||||
|
||||
|
|
@ -471,71 +482,166 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
|
|||
assert seed["was_council_house"].to_list() == ["No"]
|
||||
|
||||
|
||||
def test_match_direct_epc_considers_nearby_postcodes() -> None:
|
||||
listing_matches = pl.DataFrame(
|
||||
{
|
||||
"_listing_idx": [0],
|
||||
"_listing_match_address": ["1 EXAMPLE ROAD"],
|
||||
"_listing_match_postcode": ["AA11AA"],
|
||||
"_listing_east": [1000.0],
|
||||
"_listing_north": [1000.0],
|
||||
"_actual_property_type": ["Terraced"],
|
||||
"_actual_total_floor_area": [100.0],
|
||||
"_actual_number_habitable_rooms": [4],
|
||||
},
|
||||
schema={
|
||||
"_listing_idx": pl.UInt32,
|
||||
"_listing_match_address": pl.Utf8,
|
||||
"_listing_match_postcode": pl.Utf8,
|
||||
"_listing_east": pl.Float64,
|
||||
"_listing_north": pl.Float64,
|
||||
"_actual_property_type": pl.Utf8,
|
||||
"_actual_total_floor_area": pl.Float64,
|
||||
"_actual_number_habitable_rooms": pl.Int16,
|
||||
},
|
||||
)
|
||||
epc_candidates = pl.DataFrame(
|
||||
{
|
||||
"_direct_epc_row": [0],
|
||||
"_direct_epc_match_address": ["1 EXAMPLE ROAD"],
|
||||
"_direct_epc_match_postcode": ["BB11BB"],
|
||||
"_direct_epc_east": [1020.0],
|
||||
"_direct_epc_north": [1010.0],
|
||||
"_direct_epc_canonical_property_type": ["Terraced"],
|
||||
"_direct_epc_address": ["1, Example Road"],
|
||||
"_direct_current_energy_rating": ["C"],
|
||||
"_direct_potential_energy_rating": ["B"],
|
||||
"_direct_total_floor_area": [101.0],
|
||||
"_direct_number_habitable_rooms": [4],
|
||||
"_direct_floor_height": [2.5],
|
||||
"_direct_construction_age_band": [1930],
|
||||
"_direct_is_construction_date_approximate": [1],
|
||||
"_direct_was_council_house": ["No"],
|
||||
},
|
||||
schema={
|
||||
"_direct_epc_row": pl.UInt32,
|
||||
"_direct_epc_match_address": pl.Utf8,
|
||||
"_direct_epc_match_postcode": pl.Utf8,
|
||||
"_direct_epc_east": pl.Float64,
|
||||
"_direct_epc_north": pl.Float64,
|
||||
"_direct_epc_canonical_property_type": pl.Utf8,
|
||||
"_direct_epc_address": pl.Utf8,
|
||||
"_direct_current_energy_rating": pl.Utf8,
|
||||
"_direct_potential_energy_rating": pl.Utf8,
|
||||
"_direct_total_floor_area": pl.Float64,
|
||||
"_direct_number_habitable_rooms": pl.Int16,
|
||||
"_direct_floor_height": pl.Float64,
|
||||
"_direct_construction_age_band": pl.UInt16,
|
||||
"_direct_is_construction_date_approximate": pl.UInt8,
|
||||
"_direct_was_council_house": pl.Utf8,
|
||||
},
|
||||
# Column names and polars dtypes of the direct-EPC candidate frame; passed as
# the ``schema`` of the DataFrame built by _direct_epc_candidates().
_DIRECT_EPC_CANDIDATE_SCHEMA = {
    "_direct_epc_row": pl.UInt32,
    "_direct_epc_match_address": pl.Utf8,
    "_direct_epc_match_postcode": pl.Utf8,
    "_direct_epc_outcode": pl.Utf8,
    "_direct_epc_canonical_property_type": pl.Utf8,
    "_direct_epc_uprn": pl.Utf8,
    "_direct_epc_address": pl.Utf8,
    "_direct_current_energy_rating": pl.Utf8,
    "_direct_potential_energy_rating": pl.Utf8,
    "_direct_total_floor_area": pl.Float64,
    "_direct_number_habitable_rooms": pl.Int16,
    "_direct_floor_height": pl.Float64,
    "_direct_construction_age_band": pl.UInt16,
    "_direct_is_construction_date_approximate": pl.UInt8,
    "_direct_was_council_house": pl.Utf8,
}
|
||||
|
||||
# Column names and polars dtypes of the listing-match frame; passed as the
# ``schema`` of the DataFrame built by _listing_matches().
_LISTING_MATCH_SCHEMA = {
    "_listing_idx": pl.UInt32,
    "_listing_match_address": pl.Utf8,
    "_listing_match_postcode": pl.Utf8,
    "_listing_uprn": pl.Utf8,
}
|
||||
|
||||
|
||||
def _direct_epc_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build a direct-EPC candidate frame with one row per entry in ``rows``.

    Each entry only needs the columns a test cares about; keys it supplies
    override the defaults below. The result is typed by
    ``_DIRECT_EPC_CANDIDATE_SCHEMA``.
    """
    defaults = {
        "_direct_epc_row": 0,
        "_direct_epc_match_address": "1 EXAMPLE ROAD",
        "_direct_epc_match_postcode": "AA11AA",
        "_direct_epc_outcode": "AA1",
        "_direct_epc_canonical_property_type": "Terraced",
        "_direct_epc_uprn": None,
        "_direct_epc_address": "1, Example Road",
        "_direct_current_energy_rating": "C",
        "_direct_potential_energy_rating": "B",
        "_direct_total_floor_area": 101.0,
        "_direct_number_habitable_rooms": 4,
        "_direct_floor_height": 2.5,
        "_direct_construction_age_band": 1930,
        "_direct_is_construction_date_approximate": 1,
        "_direct_was_council_house": "No",
    }
    records = [{**defaults, **overrides} for overrides in rows]
    return pl.DataFrame(records, schema=_DIRECT_EPC_CANDIDATE_SCHEMA)
|
||||
|
||||
matches = _match_direct_epc(listing_matches, epc_candidates)
|
||||
|
||||
def _listing_matches(rows: list[dict]) -> pl.DataFrame:
    """Build a listing-match frame with one row per entry in ``rows``.

    Keys supplied by an entry override the defaults below; the result is typed
    by ``_LISTING_MATCH_SCHEMA``.
    """
    defaults = {
        "_listing_idx": 0,
        "_listing_match_address": "1 EXAMPLE ROAD",
        "_listing_match_postcode": "AA11AA",
        "_listing_uprn": None,
    }
    records = [{**defaults, **overrides} for overrides in rows]
    return pl.DataFrame(records, schema=_LISTING_MATCH_SCHEMA)
|
||||
|
||||
|
||||
def test_match_direct_epc_matches_by_uprn_across_postcodes() -> None:
    """A shared UPRN links listing and EPC even when their postcodes differ."""
    # UPRN is matched globally (not within a postcode bucket), so a listing
    # whose detail-page postcode is slightly off still resolves to the right
    # EPC certificate by its UPRN.
    listings = _listing_matches(
        [{"_listing_uprn": "100000000001", "_listing_match_postcode": "ZZ99ZZ"}]
    )
    candidates = _direct_epc_candidates(
        [{"_direct_epc_uprn": "100000000001", "_direct_epc_match_postcode": "AA11AA"}]
    )

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    expected_columns = {
        "_listing_idx": [0],
        "_direct_epc_address": ["1, Example Road"],
        "_direct_epc_match_method": ["uprn"],
    }
    for column, values in expected_columns.items():
        assert matches[column].to_list() == values
|
||||
|
||||
|
||||
def test_match_direct_epc_matches_by_address_in_same_postcode() -> None:
    """With no UPRN on either side, an equal match address yields an address match."""
    listings = _listing_matches([{"_listing_match_address": "1 EXAMPLE ROAD"}])
    candidates = _direct_epc_candidates(
        [{"_direct_epc_match_address": "1 EXAMPLE ROAD"}]
    )

    matches = _match_direct_epc(listings, candidates)

    assert matches.height == 1
    expected_columns = {
        "_direct_epc_address": ["1, Example Road"],
        "_direct_epc_match_method": ["address"],
    }
    for column, values in expected_columns.items():
        assert matches[column].to_list() == values
|
||||
|
||||
|
||||
def test_normalize_uprn_handles_types_and_floats() -> None:
    """Pin how ``_normalize_uprn`` treats missing values, strings, ints and floats."""
    # Missing and blank input has no UPRN; non-integral / NaN floats are
    # rejected rather than mangled.
    for rejected in (None, "", 1.5, float("nan")):
        assert _normalize_uprn(rejected) is None

    accepted = [
        (" 100012345678 ", "100012345678"),
        (100012345678, "100012345678"),
        # An integral float normalises to its digits, NOT "1230".
        (123.0, "123"),
    ]
    for raw, expected in accepted:
        assert _normalize_uprn(raw) == expected
|
||||
|
||||
|
||||
def _property_candidates(rows: list[dict]) -> pl.DataFrame:
    """Build a property-candidate frame with one row per entry in ``rows``.

    Keys supplied by an entry override the defaults below. Every column is a
    string column.
    """
    defaults = {
        "postcode": "AA1 1AA",
        "pp_address": "1 Example Road",
        "_property_match_postcode": "AA11AA",
        "_property_match_address": "1 EXAMPLE ROAD",
        "_property_epc_match_address": "1 EXAMPLE ROAD",
        "uprn": None,
    }
    # All six columns are Utf8, so the schema mirrors the default keys in order.
    schema = dict.fromkeys(defaults, pl.Utf8)
    records = [{**defaults, **overrides} for overrides in rows]
    return pl.DataFrame(records, schema=schema)
|
||||
|
||||
|
||||
def test_match_listing_properties_uprn_wins_dedup_tie() -> None:
    """When two listings claim one property, the UPRN match beats the address match."""
    # Two listings claim the same property: one by UPRN, one by exact address
    # (both score 100). The UPRN match must win even though it has the higher
    # _listing_idx (which would otherwise break the tie the wrong way).
    uprn_listing = {
        "_listing_idx": 5,
        "_listing_uprn": "100000000001",
        "_listing_match_address": "SOMETHING ELSE",
    }
    address_listing = {
        "_listing_idx": 1,
        "_listing_uprn": None,
        "_listing_match_address": "1 EXAMPLE ROAD",
    }
    listings = _listing_matches([uprn_listing, address_listing])
    properties = _property_candidates([{"uprn": "100000000001"}])

    matches = _match_listing_properties(listings, properties)

    assert matches.height == 1
    assert matches["_listing_idx"].to_list() == [5]
    assert matches["_property_match_method"].to_list() == ["uprn"]
|
||||
|
||||
|
||||
def test_match_direct_epc_does_not_match_other_postcode_without_uprn() -> None:
    """A same-street EPC in another postcode with no shared UPRN is not matched."""
    # Matching is by postcode/UPRN/street — never by coordinate proximity — so a
    # same-street EPC in a different postcode with no shared UPRN is skipped.
    listings = _listing_matches([{"_listing_match_postcode": "AA11AA"}])
    other_postcode_epc = _direct_epc_candidates(
        [{"_direct_epc_match_postcode": "BB22BB", "_direct_epc_uprn": None}]
    )

    matches = _match_direct_epc(listings, other_postcode_epc)

    assert matches.height == 0
|
||||
|
||||
|
||||
def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None:
|
||||
|
|
@ -588,11 +694,72 @@ def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -
|
|||
assert other["_actual_listing_url"].to_list() == [None]
|
||||
|
||||
|
||||
def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) -> None:
|
||||
def test_integrate_listings_matches_by_uprn_over_address(tmp_path) -> None:
    """A shared UPRN attaches the listing overlay despite a mismatched address.

    Writes a listings parquet whose address differs from the wide property
    row but whose UPRN is the same, runs ``_integrate_listings``, and checks
    that the overlay lands on the property row without a seed row being
    created for the listing's own address.

    Args:
        tmp_path: directory the listings and ArcGIS parquet files are written to.
    """
    # The listing's address deliberately does not match the property's, but the
    # shared UPRN drives an exact match anyway (UPRN beats fuzzy street).
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    # Each output name is aliased exactly once: with_columns rejects duplicate
    # output names, and the address must be the non-matching one for the final
    # assertion to be meaningful.
    _sample_listings_frame().with_columns(
        pl.lit("Totally Different Road").alias("Address per Property Register"),
        pl.lit("100000000009").alias("UPRN"),
    ).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    wide = pl.DataFrame(
        {
            "postcode": ["SW1A 1AA"],
            "pp_address": ["1 Example Road"],
            "uprn": ["100000000009"],
            "pp_property_type": ["Terraced"],
            "duration": ["Freehold"],
            "total_floor_area": [90.0],
            "number_habitable_rooms": [4],
            "latest_price": [600_000],
            "epc_address": ["1 Example Road"],
            "current_energy_rating": ["C"],
            "potential_energy_rating": ["B"],
            "floor_height": [2.4],
            "construction_age_band": [1930],
            "is_construction_date_approximate": [1],
            "was_council_house": ["No"],
        },
        schema={
            "postcode": pl.Utf8,
            "pp_address": pl.Utf8,
            "uprn": pl.Utf8,
            "pp_property_type": pl.Utf8,
            "duration": pl.Utf8,
            "total_floor_area": pl.Float64,
            "number_habitable_rooms": pl.Int16,
            "latest_price": pl.Int64,
            "epc_address": pl.Utf8,
            "current_energy_rating": pl.Utf8,
            "potential_energy_rating": pl.Utf8,
            "floor_height": pl.Float64,
            "construction_age_band": pl.UInt16,
            "is_construction_date_approximate": pl.UInt8,
            "was_council_house": pl.Utf8,
        },
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    matched = integrated.filter(pl.col("pp_address") == "1 Example Road")
    # The listing overlay attached to the UPRN-matched property row.
    assert matched["_actual_listing_url"].to_list() == ["https://example.test/abc"]
    # No spurious seed row for the listing's (non-matching) address.
    assert "Totally Different Road" not in integrated["pp_address"].to_list()
|
||||
|
||||
|
||||
def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> None:
|
||||
# A number-less listing whose street is not the property's street (and which
|
||||
# shares no UPRN) must not be force-matched onto it; it becomes its own seed
|
||||
# row instead of stamping the wrong property's overlay.
|
||||
listings_path = tmp_path / "listings.parquet"
|
||||
arcgis_path = tmp_path / "arcgis.parquet"
|
||||
_sample_listings_frame().with_columns(
|
||||
pl.lit("Juniper Crescent").alias("Address per Property Register"),
|
||||
).write_parquet(listings_path)
|
||||
_stub_arcgis(arcgis_path)
|
||||
wide = pl.DataFrame(
|
||||
|
|
@ -635,7 +802,7 @@ def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) ->
|
|||
).collect()
|
||||
|
||||
existing = integrated.filter(pl.col("pp_address") == "Old Cottage High Street")
|
||||
seed = integrated.filter(pl.col("pp_address") == "Rose Cottage High Street")
|
||||
seed = integrated.filter(pl.col("pp_address") == "Juniper Crescent")
|
||||
assert existing["_actual_listing_url"].to_list() == [None]
|
||||
assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
|
||||
|
||||
|
|
@ -731,3 +898,77 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
|
|||
# Overlay scaffolding is dropped.
|
||||
for src, dst, _dt in _LISTING_OVERLAY_SOURCES:
|
||||
assert dst not in finalized.columns, src
|
||||
|
||||
|
||||
def test_finalize_listings_dedupes_fanned_out_listing_rows() -> None:
    """Finalize emits one row per listing URL when a listing fanned out to two rows."""
    # The terminated-postcode remap can collapse two distinct wide rows onto the same
    # (postcode, pp_address), so a single matched listing attaches to both. Finalize
    # must emit one row per listing URL, not one per collapsed wide row.
    wide_columns = {
        "Postcode": ["SW1A 1AA", "SW1A 1AA"],
        "Address per Property Register": ["1 Example Road", "1 Example Road"],
        "Address per EPC": ["1 Example Road", "1 Example Road"],
        "Date of last transaction": [1990.0, 1995.0],
        "lat": [51.5, 51.5],
        "lon": [-0.1, -0.1],
        "Total floor area (sqm)": [100.0, 95.0],
        "Number of bedrooms & living rooms": [3, 3],
        "Property type": ["Terraced", "Terraced"],
        "Leasehold/Freehold": ["Leasehold", "Leasehold"],
        "Last known price": [500_000, 480_000],
        "Street tree density percentile": [42.0, 42.0],
    }
    # Same listing URL on both collapsed rows — the fan-out to fix.
    overlay_columns = {
        "_actual_listing_url": ["url0", "url0"],
        "_actual_asking_price": [600_000, 600_000],
        "_actual_asking_price_per_sqm": [5_000, 5_000],
        "_actual_listing_date": [None, None],
        "_actual_listing_status": ["For sale", "For sale"],
        "_actual_listing_features": [["Garden"], ["Garden"]],
        "_actual_bedrooms": [3, 3],
        "_actual_bathrooms": [1, 1],
        "_actual_price_qualifier": ["", ""],
        "_actual_property_sub_type": ["Mid-Terrace", "Mid-Terrace"],
        "_actual_lat": [51.51, 51.51],
        "_actual_lon": [-0.11, -0.11],
        "_actual_total_floor_area": [110.0, 110.0],
        "_actual_number_habitable_rooms": [4, 4],
        "_actual_property_type": ["Terraced", "Terraced"],
        "_actual_leasehold_freehold": ["Freehold", "Freehold"],
    }
    dtypes = {
        "Postcode": pl.Utf8,
        "Address per Property Register": pl.Utf8,
        "Address per EPC": pl.Utf8,
        "Date of last transaction": pl.Float64,
        "lat": pl.Float64,
        "lon": pl.Float64,
        "Total floor area (sqm)": pl.Float64,
        "Number of bedrooms & living rooms": pl.Int16,
        "Property type": pl.Utf8,
        "Leasehold/Freehold": pl.Utf8,
        "Last known price": pl.Int64,
        "Street tree density percentile": pl.Float32,
        "_actual_listing_url": pl.Utf8,
        "_actual_asking_price": pl.Int64,
        "_actual_asking_price_per_sqm": pl.Int32,
        "_actual_listing_date": pl.Datetime("us"),
        "_actual_listing_status": pl.Utf8,
        "_actual_listing_features": pl.List(pl.Utf8),
        "_actual_bedrooms": pl.Int32,
        "_actual_bathrooms": pl.Int32,
        "_actual_price_qualifier": pl.Utf8,
        "_actual_property_sub_type": pl.Utf8,
        "_actual_lat": pl.Float64,
        "_actual_lon": pl.Float64,
        "_actual_total_floor_area": pl.Float64,
        "_actual_number_habitable_rooms": pl.Int16,
        "_actual_property_type": pl.Utf8,
        "_actual_leasehold_freehold": pl.Utf8,
    }
    fanned_out = pl.DataFrame({**wide_columns, **overlay_columns}, schema=dtypes)

    deduped = _finalize_listings(fanned_out)

    assert deduped.height == 1
    assert deduped["Listing URL"].to_list() == ["url0"]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue