try
This commit is contained in:
parent
843d14b7ba
commit
c938b71904
13 changed files with 698 additions and 109 deletions
|
|
@ -21,7 +21,15 @@ from ..utils import (
|
|||
pl.Config.set_tbl_cols(-1)
|
||||
|
||||
RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
|
||||
# Value-quality floor for price aggregations. A flat nominal floor is a blunt
# tool against a deflating threshold — £50k was completely normal for a 1990s
# house, so a 50k floor wrongly discarded ~a third of legitimate 1990s
# open-market sales (and deleted properties whose only sales were old/cheap),
# biasing early-year price history upward. 10k recovers the large [10k,50k)
# band of genuine cheaper sales while still excluding the nominal/junk transfers
# (£1 etc.). A small tail of real sub-10k sales is still dropped — a deliberate
# conservative tradeoff to keep clearly-implausible transfers out.
MIN_PRICE = 10_000
|
||||
|
||||
# Plausible construction-year range; band-derived years outside it (e.g. OCR
|
||||
# noise like 1012 or 2202) are nulled rather than published.
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
|
|||
|
||||
MIN_FLOOR_AREA_M2 = 10
|
||||
CONSERVATION_AREA_FEATURE = "Within conservation area"
|
||||
TREE_DENSITY_FEATURE = "Street tree density percentile"
|
||||
LISTED_BUILDING_FEATURE = "Listed building"
|
||||
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
|
||||
LISTED_BUILDING_NEAREST_POSTCODES = 3
|
||||
|
|
@ -92,6 +93,10 @@ _AREA_COLUMNS = [
|
|||
"Noise (dB)",
|
||||
"Max available download speed (Mbps)",
|
||||
CONSERVATION_AREA_FEATURE,
|
||||
# Tree canopy is a 50m-radius percentile around the postcode centroid, so it
|
||||
# is postcode-grain: it belongs in the area output (one value per postcode,
|
||||
# covering property-less postcodes too) rather than duplicated per property.
|
||||
TREE_DENSITY_FEATURE,
|
||||
# Schools
|
||||
"Good+ primary schools within 5km",
|
||||
"Good+ secondary schools within 5km",
|
||||
|
|
@ -116,7 +121,6 @@ _AREA_COLUMNS = [
|
|||
|
||||
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
|
||||
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
|
||||
TREE_DENSITY_FEATURE = "Street tree density percentile"
|
||||
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
|
||||
r"^Tree canopy density percentile within \d+m$"
|
||||
)
|
||||
|
|
@ -818,9 +822,9 @@ def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
|
|||
untouched. pp_address is non-null here (join_epc_pp filters it), so the key
|
||||
never merges unrelated rows.
|
||||
"""
|
||||
return wide.sort(
|
||||
"date_of_transfer", descending=True, nulls_last=True
|
||||
).unique(subset=["postcode", "pp_address"], keep="first", maintain_order=True)
|
||||
return wide.sort("date_of_transfer", descending=True, nulls_last=True).unique(
|
||||
subset=["postcode", "pp_address"], keep="first", maintain_order=True
|
||||
)
|
||||
|
||||
|
||||
def _filter_to_active_english_postcodes(
|
||||
|
|
@ -1108,13 +1112,26 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
|
|||
return epc_band_to_year(pl.col(column))
|
||||
|
||||
|
||||
def _address_score(query: str, candidate: str | None) -> int:
|
||||
def _address_score(query: str, candidate: str | None, *, allow_token_set: bool) -> int:
|
||||
if not candidate:
|
||||
return 0
|
||||
return max(
|
||||
fuzz.token_set_ratio(query, candidate),
|
||||
fuzz.token_sort_ratio(query, candidate),
|
||||
)
|
||||
# token_set_ratio returns 100 whenever the shorter token set is a subset of
|
||||
# the longer. For a NUMBER-LESS query that is unsafe — a single locality
|
||||
# token (e.g. "KINGSWOOD") subsets to 100 against any long address that
|
||||
# merely contains it — so number-less queries score with token_sort_ratio
|
||||
# only, matching the canonical fuzzy_join._score_bucket. For a NUMBERED
|
||||
# query the unconditional _numbers_compatible gate has already guaranteed the
|
||||
# candidate carries compatible house numbers, so token_set cannot inflate
|
||||
# across different addresses; allowing it recovers genuine matches where the
|
||||
# scraped listing appends trailing town/county tokens the bare register
|
||||
# address omits (e.g. "105 RIDGEWAY DRIVE BROMLEY KENT" vs "105 RIDGEWAY
|
||||
# DRIVE").
|
||||
if allow_token_set:
|
||||
return max(
|
||||
fuzz.token_set_ratio(query, candidate),
|
||||
fuzz.token_sort_ratio(query, candidate),
|
||||
)
|
||||
return fuzz.token_sort_ratio(query, candidate)
|
||||
|
||||
|
||||
def _has_number(address: str | None) -> bool:
|
||||
|
|
@ -1153,9 +1170,12 @@ def _best_listing_match(
|
|||
``uprn_index`` (postcode-independent, so it is robust even when the
|
||||
listing's postcode is slightly off); (2) failing that, the highest
|
||||
fuzzy street-address similarity within the listing's own postcode bucket.
|
||||
No property-attribute heuristics are used — a house number in the listing
|
||||
address gates the fuzzy match (`_numbers_compatible`) and lowers the score
|
||||
threshold; a number-less address must match the street almost exactly.
|
||||
No property-attribute heuristics are used — `_numbers_compatible` gates
|
||||
every fuzzy match unconditionally (so a number-less listing can never match
|
||||
a numbered property, and vice versa), as in the canonical
|
||||
`fuzzy_join._score_bucket`. A house number additionally lowers the score
|
||||
threshold and (via `_address_score`) permits token_set scoring; a number-less
|
||||
address scores on token_sort only and must match the street almost exactly.
|
||||
|
||||
``addressed_fields`` names the candidate columns to fuzzy-match against (a
|
||||
candidate may carry both a register and an EPC address). Returns
|
||||
|
|
@ -1180,9 +1200,11 @@ def _best_listing_match(
|
|||
address = candidate.get(field)
|
||||
if not address:
|
||||
continue
|
||||
if listing_has_numbers and not _numbers_compatible(query, address):
|
||||
# Unconditional number gate (matches fuzzy_join): a number-less
|
||||
# listing cannot match a numbered candidate and vice versa.
|
||||
if not _numbers_compatible(query, address):
|
||||
continue
|
||||
score = _address_score(query, address)
|
||||
score = _address_score(query, address, allow_token_set=listing_has_numbers)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best = candidate
|
||||
|
|
@ -1675,7 +1697,9 @@ def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
|
|||
# "Yes". "Former council house" should fire if EITHER side says so.
|
||||
if raw_column == "was_council_house":
|
||||
return (
|
||||
pl.when((pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes"))
|
||||
pl.when(
|
||||
(pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes")
|
||||
)
|
||||
.then(pl.lit("Yes"))
|
||||
.otherwise(coalesce)
|
||||
.alias(raw_column)
|
||||
|
|
@ -1716,9 +1740,13 @@ def _build_unmatched_listing_seed_rows(
|
|||
"total_floor_area": pl.coalesce(
|
||||
pl.col("_actual_total_floor_area"), pl.col("_direct_total_floor_area")
|
||||
),
|
||||
# Prefer the direct-EPC habitable-room count over the listing's value:
|
||||
# the scraped room count is bedrooms + bathrooms (upstream storage.py
|
||||
# defect), so it over-counts. Fall back to the listing value only when
|
||||
# the direct-EPC match has no count.
|
||||
"number_habitable_rooms": pl.coalesce(
|
||||
pl.col("_actual_number_habitable_rooms"),
|
||||
pl.col("_direct_number_habitable_rooms"),
|
||||
pl.col("_actual_number_habitable_rooms"),
|
||||
),
|
||||
"latest_price": pl.col("_actual_asking_price"),
|
||||
}
|
||||
|
|
@ -1836,14 +1864,19 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
|
|||
# Listing coordinates win over the postcode centroid.
|
||||
pl.coalesce(pl.col("_actual_lat").cast(pl.Float64), pl.col("lat")).alias("lat"),
|
||||
pl.coalesce(pl.col("_actual_lon").cast(pl.Float64), pl.col("lon")).alias("lon"),
|
||||
# Listing's floor area / rooms override any EPC/PP value when present.
|
||||
# Listing's floor area overrides any EPC/PP value when present.
|
||||
pl.coalesce(
|
||||
pl.col("_actual_total_floor_area").cast(pl.Float64),
|
||||
pl.col("Total floor area (sqm)"),
|
||||
).alias("Total floor area (sqm)"),
|
||||
# Rooms: prefer the EPC habitable-room count and fall back to the listing
|
||||
# value only when no EPC count exists. The scraped "Number of bedrooms &
|
||||
# living rooms" is actually bedrooms + bathrooms (an upstream storage.py
|
||||
# defect), so preferring it would inflate the room count and overwrite a
|
||||
# correct EPC value.
|
||||
pl.coalesce(
|
||||
pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
|
||||
pl.col("Number of bedrooms & living rooms"),
|
||||
pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
|
||||
).alias("Number of bedrooms & living rooms"),
|
||||
pl.when(pl.col("_actual_property_type").is_in(_PROPERTY_TYPE_VALUES))
|
||||
.then(pl.col("_actual_property_type"))
|
||||
|
|
@ -2130,12 +2163,15 @@ def _build(
|
|||
pl.when(
|
||||
(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
|
||||
& (
|
||||
(pl.col("latest_price") / pl.col("total_floor_area"))
|
||||
.is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
|
||||
(pl.col("latest_price") / pl.col("total_floor_area")).is_between(
|
||||
MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM
|
||||
)
|
||||
)
|
||||
)
|
||||
.then(
|
||||
(pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
|
||||
(pl.col("latest_price") / pl.col("total_floor_area"))
|
||||
.round(0)
|
||||
.cast(pl.Int32)
|
||||
)
|
||||
.otherwise(None)
|
||||
.alias("Price per sqm"),
|
||||
|
|
|
|||
|
|
@ -378,7 +378,10 @@ def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: P
|
|||
price_paid_path = tmp_path / "price-paid.parquet"
|
||||
pl.DataFrame(
|
||||
{
|
||||
"price": [30_000, 300_000],
|
||||
# 5_000 is below MIN_PRICE (10_000) — a nominal/junk transfer that
|
||||
# must still anchor the construction year but stay out of the price
|
||||
# aggregations.
|
||||
"price": [5_000, 300_000],
|
||||
"date_of_transfer": [date(2015, 2, 3), date(2022, 2, 3)],
|
||||
"property_type": ["T", "T"],
|
||||
"postcode": ["AA1 1AA", "AA1 1AA"],
|
||||
|
|
@ -408,6 +411,48 @@ def test_run_new_build_keeps_early_first_transfer_when_sub_min_price(tmp_path: P
|
|||
assert df.get_column("historical_prices").list.len().to_list() == [1]
|
||||
|
||||
|
||||
def test_run_keeps_sale_above_lowered_min_price(tmp_path: Path):
    """A 30_000 sale must be retained now that the price floor is 10k.

    30_000 lies between the OLD floor (50k) and the NEW floor (10k). On the
    pre-fix 50k floor it was excluded, giving historical_prices length 1 and
    latest_price 250_000, so this test pins the 50k->10k change.
    """
    # Single EPC certificate row, zipped the way the pipeline expects.
    zip_path = tmp_path / "domestic-csv.zip"
    with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        epc_csv = io.StringIO()
        epc_writer = csv.DictWriter(epc_csv, fieldnames=EPC_SOURCE_COLUMNS)
        epc_writer.writeheader()
        epc_writer.writerow(_row())
        archive.writestr("certificates-2024.csv", epc_csv.getvalue())

    # Two sales of the same address: an older normal one and a recent cheap one.
    sales = {
        "price": [250_000, 30_000],
        "date_of_transfer": [date(2018, 2, 3), date(2022, 2, 3)],
        "property_type": ["T", "T"],
        "postcode": ["AA1 1AA", "AA1 1AA"],
        "paon": ["1", "1"],
        "saon": [None, None],
        "street": ["Example Street", "Example Street"],
        "locality": [None, None],
        "town_city": ["Exampletown", "Exampletown"],
        "duration": ["F", "F"],
        "old_new": ["N", "N"],
        "ppd_category": ["A", "A"],
    }
    price_paid_path = tmp_path / "price-paid.parquet"
    pl.DataFrame(sales).write_parquet(price_paid_path)

    output_path = tmp_path / "epc-pp.parquet"
    _run(zip_path, price_paid_path, output_path, tmp_path)
    result = pl.read_parquet(output_path)

    assert result.height == 1
    # Both sales clear the 10k floor; the 2022 sale (30_000) is the latest.
    assert result.get_column("historical_prices").list.len().to_list() == [2]
    assert result.get_column("latest_price").to_list() == [30_000]
|
||||
|
||||
|
||||
def test_epc_band_to_year_uses_midpoint_and_clamps():
|
||||
import polars as pl
|
||||
|
||||
|
|
|
|||
|
|
@ -13,6 +13,7 @@ from pipeline.transform.merge import (
|
|||
_active_english_postcode_area,
|
||||
_build_unmatched_listing_seed_rows,
|
||||
_canonical_postcode_expr,
|
||||
_best_listing_match,
|
||||
_coalesce_direct_epc_columns,
|
||||
_dedupe_collapsed_properties,
|
||||
_filter_to_active_english_postcodes,
|
||||
|
|
@ -78,6 +79,40 @@ def test_conservation_area_feature_is_area_level() -> None:
|
|||
assert CONSERVATION_AREA_FEATURE in _AREA_COLUMNS
|
||||
|
||||
|
||||
def test_tree_density_is_area_level_and_survives_the_split() -> None:
    """Street tree density must land in the postcode output only.

    The feature is a postcode-centroid percentile (constant per postcode), so
    it has to be routed to the postcode/area frame — not stripped by
    _area_columns_from — and must not be duplicated into the property frame.
    Regression for the drift where it landed only in properties.parquet and
    was lost for the ~308k property-less postcodes.
    """
    assert TREE_DENSITY_FEATURE in _AREA_COLUMNS

    # One property row, in the first of the two postcodes below.
    property_rows = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Last known price": [250_000],
            TREE_DENSITY_FEATURE: [42.0],
        }
    )
    # Two postcodes; "BB1 1BB" has no property row at all.
    area_features = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA", "BB1 1BB"],
            "lat": [51.0, 52.0],
            "lon": [-0.1, -0.2],
            "ctry25cd": ["E92000001", "E92000001"],
            TREE_DENSITY_FEATURE: [42.0, 7.0],
        }
    )

    area_out, property_out = _split_normal_outputs(
        property_rows, area_features, expected_postcode_count=2
    )

    assert TREE_DENSITY_FEATURE in area_out.columns
    assert area_out[TREE_DENSITY_FEATURE].to_list() == [42.0, 7.0]
    assert TREE_DENSITY_FEATURE not in property_out.columns
|
||||
|
||||
|
||||
def test_crime_columns_are_spatial_counts_not_per_capita() -> None:
|
||||
# Crime is now a raw spatial count per postcode; the per-1k-residents
|
||||
# variants were dropped along with the LSOA population denominator.
|
||||
|
|
@ -767,6 +802,41 @@ def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
|
|||
assert seed["was_council_house"].to_list() == ["No"]
|
||||
|
||||
|
||||
def test_build_unmatched_listing_seed_rows_prefers_direct_epc_rooms_over_listing(
    tmp_path,
) -> None:
    """The direct-EPC room count beats the listing's when both are present.

    The scraped "Number of bedrooms & living rooms" is really bedrooms +
    bathrooms (upstream defect), so letting it win would inflate the count.
    This pins the coalesce order: direct-EPC first, listing second.
    """
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # Listing carries the corrupt (beds + baths) room count of 5.
    corrupt_rooms = pl.lit(5, dtype=pl.Int32).alias("Number of bedrooms & living rooms")
    _sample_listings_frame().with_columns(corrupt_rooms).write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # The direct-EPC side carries the genuine habitable-room count of 3.
    epc_rooms = pl.lit(3, dtype=pl.Int16).alias("_direct_number_habitable_rooms")
    listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns(
        epc_rooms
    )

    overlay_columns = {dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES}
    template_schema = pl.Schema(
        {
            "postcode": pl.Utf8,
            "pp_address": pl.Utf8,
            "number_habitable_rooms": pl.Int16,
            **overlay_columns,
        }
    )

    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )

    assert seed["number_habitable_rooms"].to_list() == [3]
|
||||
|
||||
|
||||
_DIRECT_EPC_CANDIDATE_SCHEMA = {
|
||||
"_direct_epc_row": pl.UInt32,
|
||||
"_direct_epc_match_address": pl.Utf8,
|
||||
|
|
@ -1249,6 +1319,98 @@ def test_integrate_listings_seeds_listing_with_unmatched_street(tmp_path) -> Non
|
|||
assert seed["_actual_listing_url"].to_list() == ["https://example.test/abc"]
|
||||
|
||||
|
||||
def test_best_listing_match_rejects_numberless_listing_against_numbered_property() -> (
    None
):
    """A number-less listing must never match a numbered property.

    Regression: the number gate is unconditional (as in fuzzy_join) and
    number-less queries score on token_sort_ratio only, so a lone locality
    token can no longer subset-inflate to 100 against a long numbered address.
    """
    numbered_property = {"pp_address": "FLAT A3 CHESHAM HEIGHTS ST MONICAS ROAD"}

    match = _best_listing_match(
        listing_uprn=None,
        query="KINGSWOOD",
        uprn_index={},
        bucket_candidates=[numbered_property],
        addressed_fields=["pp_address"],
    )

    assert match is None
|
||||
|
||||
|
||||
def test_best_listing_match_allows_numberless_to_numberless_named_house() -> None:
    """A number-less listing still matches a number-less (named-house) property.

    The street/name has to match almost exactly; here it is identical.
    """
    named_house = "WOODLANDS HOUSE OAK LANE"

    match = _best_listing_match(
        listing_uprn=None,
        query=named_house,
        uprn_index={},
        bucket_candidates=[{"pp_address": named_house}],
        addressed_fields=["pp_address"],
    )

    assert match is not None
    _candidate, score, method, _field = match
    assert method == "address"
    assert score >= 90.0
|
||||
|
||||
|
||||
def test_best_listing_match_still_matches_numbered_listing_to_numbered_property() -> (
    None
):
    """Numbered listings are not regressed by the unconditional number gate.

    A compatible house number still passes the gate and the lower
    with-numbers score threshold applies.
    """
    numbered_address = "10 OAK LANE"

    match = _best_listing_match(
        listing_uprn=None,
        query=numbered_address,
        uprn_index={},
        bucket_candidates=[{"pp_address": numbered_address}],
        addressed_fields=["pp_address"],
    )

    assert match is not None
    _matched, score, method, _matched_field = match
    assert method == "address"
    assert score >= 82.0
|
||||
|
||||
|
||||
def test_best_listing_match_numbered_listing_with_trailing_locality_still_matches() -> (
    None
):
    """Trailing town/county tokens on a numbered listing must not block a match.

    Scraped listings often append town/county tokens that the bare Price-Paid
    register address omits. token_sort alone would score this ~73 (below 82)
    and drop a correct match; token_set, permitted for numbered queries where
    the number gate makes it safe, recovers it.
    """
    register_address = "105 RIDGEWAY DRIVE"

    match = _best_listing_match(
        listing_uprn=None,
        query="105 RIDGEWAY DRIVE BROMLEY KENT",
        uprn_index={},
        bucket_candidates=[{"pp_address": register_address}],
        addressed_fields=["pp_address"],
    )

    assert match is not None
    matched, score, _method, _field = match
    assert matched["pp_address"] == register_address
    assert score >= 82.0
|
||||
|
||||
|
||||
def test_best_listing_match_numbered_query_cannot_subset_inflate_across_numbers() -> (
    None
):
    """Incompatible house numbers are rejected before any scoring happens.

    token_set is only safe for numbered queries because the number gate runs
    first, so it can never inflate "10 OAK LANE" onto "12 OAK LANE".
    """
    other_number = {"pp_address": "12 OAK LANE KINGSTON"}

    match = _best_listing_match(
        listing_uprn=None,
        query="10 OAK LANE",
        uprn_index={},
        bucket_candidates=[other_number],
        addressed_fields=["pp_address"],
    )

    assert match is None
|
||||
|
||||
|
||||
def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
|
||||
None
|
||||
):
|
||||
|
|
@ -1325,9 +1487,12 @@ def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows(
|
|||
assert finalized["Est. price per sqm"].to_list() == [5_000, 7_368]
|
||||
assert finalized["Estimated current price"].to_list() == [600_000, 700_000]
|
||||
assert finalized["Last known price"].to_list() == [500_000, 700_000]
|
||||
# Listing's preferred floor area / rooms / property type / tenure.
|
||||
# Listing's preferred floor area / property type / tenure.
|
||||
assert finalized["Total floor area (sqm)"].to_list() == [110.0, 95.0]
|
||||
assert finalized["Number of bedrooms & living rooms"].to_list() == [4, 3]
|
||||
# Rooms prefer the EPC habitable-room count over the listing's beds+baths
|
||||
# value: row 0 keeps the EPC 3 (not the listing's _actual 4); row 1 has no
|
||||
# EPC count so it falls back to the listing's 3.
|
||||
assert finalized["Number of bedrooms & living rooms"].to_list() == [3, 3]
|
||||
assert finalized["Property type"].to_list() == ["Terraced", "Flats/Maisonettes"]
|
||||
assert finalized["Leasehold/Freehold"].to_list() == ["Freehold", "Leasehold"]
|
||||
# Postcode-level feature carried through to both matched and unmatched rows.
|
||||
|
|
|
|||
|
|
@ -5,11 +5,72 @@ import polars as pl
|
|||
from pipeline.transform.transform_poi import (
|
||||
_load_ofsted_ratings,
|
||||
_school_icon_category_expr,
|
||||
osm_groceries_colocated_with_geolytix,
|
||||
transform,
|
||||
transform_grocery_retail_points,
|
||||
)
|
||||
|
||||
|
||||
def test_osm_groceries_colocated_with_geolytix_drops_only_brand_matched_duplicates():
    """Only an OSM grocery that is both colocated and brand-matched is dropped.

    GEOLYTIX is authoritative for its chains. An OSM grocery on top of a
    GEOLYTIX store that also carries its brand is the same physical store; an
    independent shop at the same spot, or a same-brand store far from any
    GEOLYTIX point, must be kept.
    """
    tesco_point = pl.DataFrame(
        {
            "category": ["Tesco"],
            "lat": [51.5000],
            "lng": [-0.1000],
        }
    )
    # First two rows sit within a few metres of the GEOLYTIX Tesco; the third
    # is tens of kilometres away, far outside the 50 m radius.
    osm_rows = pl.DataFrame(
        {
            "id": ["dup-brand", "independent", "far-brand"],
            "name": ["Tesco Express", "Bob's Corner Shop", "Tesco Express"],
            "lat": [51.50001, 51.50002, 52.0],
            "lng": [-0.10001, -0.1000, -1.0],
        }
    )

    dropped = osm_groceries_colocated_with_geolytix(
        osm_rows, tesco_point, radius_m=50.0
    )

    assert dropped == ["dup-brand"]
|
||||
|
||||
|
||||
def test_osm_groceries_colocated_with_geolytix_dedupes_cooperative_spelling():
    """The Co-op alias lets differently spelt names still count as duplicates.

    GEOLYTIX brand "Co-op" tokenises to "coop", while OSM commonly spells it
    "The Co-operative Food" -> "cooperative"; the alias folds the two together
    so the genuine duplicate is still dropped.
    """
    coop_point = pl.DataFrame({"category": ["Co-op"], "lat": [53.0], "lng": [-1.5]})
    osm_rows = pl.DataFrame(
        {
            "id": ["coop-dup"],
            "name": ["The Co-operative Food"],
            "lat": [53.00001],
            "lng": [-1.5],
        }
    )

    dropped = osm_groceries_colocated_with_geolytix(
        osm_rows, coop_point, radius_m=50.0
    )

    assert dropped == ["coop-dup"]
|
||||
|
||||
|
||||
def test_osm_groceries_colocated_with_geolytix_handles_empty_inputs():
    """An empty frame on either side yields no ids to drop."""
    # Empty OSM side, populated GEOLYTIX side.
    tesco_point = pl.DataFrame({"category": ["Tesco"], "lat": [51.5], "lng": [-0.1]})
    no_osm_rows = pl.DataFrame(
        schema={"id": pl.Utf8, "name": pl.Utf8, "lat": pl.Float64, "lng": pl.Float64}
    )
    assert osm_groceries_colocated_with_geolytix(no_osm_rows, tesco_point) == []

    # Populated OSM side, empty GEOLYTIX side.
    osm_rows = pl.DataFrame(
        {"id": ["x"], "name": ["Tesco Express"], "lat": [51.5], "lng": [-0.1]}
    )
    no_geolytix_rows = pl.DataFrame(
        schema={"category": pl.Utf8, "lat": pl.Float64, "lng": pl.Float64}
    )
    assert osm_groceries_colocated_with_geolytix(osm_rows, no_geolytix_rows) == []
|
||||
|
||||
|
||||
def _write_boundary(tmp_path):
|
||||
"""A FeatureCollection whose single feature covers the London-area test
|
||||
coords used by the transform() fixtures, so in_england_mask keeps them."""
|
||||
|
|
@ -345,12 +406,7 @@ def test_load_ofsted_ratings_falls_back_to_ungraded_outcome(tmp_path):
|
|||
}
|
||||
).write_parquet(ofsted_path)
|
||||
|
||||
ratings = (
|
||||
_load_ofsted_ratings(ofsted_path)
|
||||
.collect()
|
||||
.sort("urn")
|
||||
.to_dicts()
|
||||
)
|
||||
ratings = _load_ofsted_ratings(ofsted_path).collect().sort("urn").to_dicts()
|
||||
|
||||
assert ratings == [
|
||||
{"urn": 1, "ofsted_rating": "Outstanding"},
|
||||
|
|
@ -384,9 +440,9 @@ def test_school_icon_category_handles_one_sided_age_ranges():
|
|||
},
|
||||
)
|
||||
|
||||
categories = df.select(
|
||||
_school_icon_category_expr().alias("category")
|
||||
)["category"].to_list()
|
||||
categories = df.select(_school_icon_category_expr().alias("category"))[
|
||||
"category"
|
||||
].to_list()
|
||||
|
||||
assert categories == [
|
||||
"Nursery school",
|
||||
|
|
@ -449,6 +505,45 @@ def test_osm_supermarkets_dropped(tmp_path):
|
|||
assert convenience.height == 1
|
||||
|
||||
|
||||
def test_transform_grocery_dedup_drops_only_grocery_aspect(tmp_path):
    """Grocery dedup removes the duplicate grocery row and nothing else.

    The _write_transform_inputs fixture seeds 5 GEOLYTIX "Tesco" points at
    (51.52, -0.14). An OSM object colocated there with "Tesco" in its name is
    the same physical store, so its Convenience Store (Groceries) row is a
    duplicate and must go — but its NON-grocery aspect (a Post Office sharing
    the same OSM id) must survive. An independent shop away from the GEOLYTIX
    point keeps its grocery row.
    """
    osm_raw = pl.DataFrame(
        {
            "id": ["n1", "n1", "n2"],
            "name": ["Tesco Express", "Tesco Express", "Corner Shop"],
            "category": [
                "shop/convenience",
                "amenity/post_office",
                "shop/convenience",
            ],
            "lat": [51.52, 51.52, 51.40],
            "lng": [-0.14, -0.14, -0.05],
        }
    )
    transform_inputs = _write_transform_inputs(tmp_path, osm_raw)

    out = transform(**transform_inputs).collect()

    is_n1 = pl.col("id") == "n1"
    is_n2 = pl.col("id") == "n2"

    # Colocated, brand-matched grocery row: dropped.
    assert out.filter(is_n1 & (pl.col("group") == "Groceries")).height == 0
    # Same OSM id, non-grocery aspect (Post Office): kept.
    assert out.filter(is_n1 & (pl.col("category") == "Post Office")).height == 1
    # Independent corner shop (no brand, far away): grocery row kept.
    assert out.filter(is_n2 & (pl.col("category") == "Convenience Store")).height == 1
|
||||
|
||||
|
||||
def test_transform_output_unique_per_id_category(tmp_path):
|
||||
# Soundness: the full transform() output has at most one row per
|
||||
# (id, category) overall, across every source.
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.utils.england_geometry import in_england_mask
|
||||
|
|
@ -955,7 +956,6 @@ _CATEGORIES: list[tuple[str, str, str, list[str]]] = [
|
|||
# Note: schools come from the GIAS register (see transform_gias_schools).
|
||||
# Niche/tertiary education amenities that GIAS does not cover are dropped
|
||||
# rather than mixed in with state-funded schools.
|
||||
|
||||
(
|
||||
"Local Businesses",
|
||||
"Hotel",
|
||||
|
|
@ -1441,38 +1441,128 @@ def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
|
|||
# category mirrors icon_category so the dashboard renders one toggle per
|
||||
# school type (Nursery / Primary / Secondary / Sixth form / University /…)
|
||||
# instead of bundling every GIAS row under a single "School" pill.
|
||||
return pl.scan_parquet(gias_path).join(ofsted, on="urn", how="left").select(
|
||||
pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
|
||||
pl.col("name"),
|
||||
icon_category_expr.alias("category"),
|
||||
icon_category_expr.alias("icon_category"),
|
||||
pl.lit("Education").alias("group"),
|
||||
pl.col("lat").cast(pl.Float64),
|
||||
pl.col("lng").cast(pl.Float64),
|
||||
emoji_expr.alias("emoji"),
|
||||
pl.col("phase").alias("school_phase"),
|
||||
pl.col("type").alias("school_type"),
|
||||
pl.col("type_group").alias("school_type_group"),
|
||||
pl.col("age_range").alias("school_age_range"),
|
||||
pl.col("gender").alias("school_gender"),
|
||||
pl.col("religious_character").alias("school_religious_character"),
|
||||
pl.col("admissions_policy").alias("school_admissions_policy"),
|
||||
pl.col("nursery_provision").alias("school_nursery_provision"),
|
||||
pl.col("sixth_form").alias("school_sixth_form"),
|
||||
pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
|
||||
pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
|
||||
pl.col("fsm_percent").cast(pl.Float32, strict=False).alias("school_fsm_percent"),
|
||||
pl.col("trust").alias("school_trust"),
|
||||
pl.col("address").alias("school_address"),
|
||||
pl.col("postcode").alias("school_postcode"),
|
||||
pl.col("local_authority").alias("school_local_authority"),
|
||||
pl.col("website").alias("school_website"),
|
||||
pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
|
||||
pl.col("head_name").alias("school_head_name"),
|
||||
pl.col("ofsted_rating").alias("school_ofsted_rating"),
|
||||
return (
|
||||
pl.scan_parquet(gias_path)
|
||||
.join(ofsted, on="urn", how="left")
|
||||
.select(
|
||||
pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
|
||||
pl.col("name"),
|
||||
icon_category_expr.alias("category"),
|
||||
icon_category_expr.alias("icon_category"),
|
||||
pl.lit("Education").alias("group"),
|
||||
pl.col("lat").cast(pl.Float64),
|
||||
pl.col("lng").cast(pl.Float64),
|
||||
emoji_expr.alias("emoji"),
|
||||
pl.col("phase").alias("school_phase"),
|
||||
pl.col("type").alias("school_type"),
|
||||
pl.col("type_group").alias("school_type_group"),
|
||||
pl.col("age_range").alias("school_age_range"),
|
||||
pl.col("gender").alias("school_gender"),
|
||||
pl.col("religious_character").alias("school_religious_character"),
|
||||
pl.col("admissions_policy").alias("school_admissions_policy"),
|
||||
pl.col("nursery_provision").alias("school_nursery_provision"),
|
||||
pl.col("sixth_form").alias("school_sixth_form"),
|
||||
pl.col("capacity").cast(pl.Int32, strict=False).alias("school_capacity"),
|
||||
pl.col("pupils").cast(pl.Int32, strict=False).alias("school_pupils"),
|
||||
pl.col("fsm_percent")
|
||||
.cast(pl.Float32, strict=False)
|
||||
.alias("school_fsm_percent"),
|
||||
pl.col("trust").alias("school_trust"),
|
||||
pl.col("address").alias("school_address"),
|
||||
pl.col("postcode").alias("school_postcode"),
|
||||
pl.col("local_authority").alias("school_local_authority"),
|
||||
pl.col("website").alias("school_website"),
|
||||
pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
|
||||
pl.col("head_name").alias("school_head_name"),
|
||||
pl.col("ofsted_rating").alias("school_ofsted_rating"),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# OSM convenience-format stores that GEOLYTIX also covers (Tesco Express,
|
||||
# Sainsbury's Local, Co-op Food, Morrisons Daily, Spar, ...) would otherwise be
|
||||
# counted twice: once as a GEOLYTIX brand row and once as an OSM "Convenience
|
||||
# Store". GEOLYTIX is authoritative for its chains, so an OSM grocery row that
|
||||
# sits on top of a GEOLYTIX point AND carries that point's brand name is the
|
||||
# same physical store and is dropped. Independent corner shops never carry a
|
||||
# chain brand, so they are kept.
|
||||
GROCERY_DEDUP_RADIUS_M = 50.0
|
||||
|
||||
# Brand-token aliases so an OSM name spelt differently from the GEOLYTIX brand
|
||||
# still matches. GEOLYTIX's "Co-op" tokenises to "coop", but OSM frequently
|
||||
# spells it "The Co-operative Food" -> "cooperative"; without this, ~300+ genuine
|
||||
# Co-op duplicates would survive. Keys/values are post-strip (alnum-only) tokens.
|
||||
_GROCERY_TOKEN_ALIASES = {
|
||||
"cooperative": "coop",
|
||||
"cooperatives": "coop",
|
||||
}
|
||||
|
||||
|
||||
def _significant_tokens(name: str | None) -> set[str]:
|
||||
"""Lower-case alphanumeric tokens of length >= 3 from a POI name (aliased)."""
|
||||
if not name:
|
||||
return set()
|
||||
tokens: set[str] = set()
|
||||
for raw in str(name).lower().split():
|
||||
token = "".join(ch for ch in raw if ch.isalnum())
|
||||
if len(token) >= 3:
|
||||
tokens.add(_GROCERY_TOKEN_ALIASES.get(token, token))
|
||||
return tokens
|
||||
|
||||
|
||||
def osm_groceries_colocated_with_geolytix(
    osm_groceries: pl.DataFrame,
    geolytix: pl.DataFrame,
    radius_m: float = GROCERY_DEDUP_RADIUS_M,
) -> list[str]:
    """Return OSM grocery ids that duplicate a GEOLYTIX store.

    An OSM Groceries row is a duplicate when a GEOLYTIX point lies within
    ``radius_m`` metres AND that point's brand tokens (its ``category``, e.g.
    "Tesco", "Co-op") are all present in the OSM row's name — i.e. the same
    physical branded store. Brands with no token >= 3 chars (e.g. "M&S") never
    match, so they are conservatively kept rather than risk a false drop.

    Rows whose ``lat``/``lng`` is missing or non-finite are ignored on both
    sides: they can be neither a duplicate nor the evidence for one. Without
    this, a single NaN would poison the shared mean latitude and every
    projected coordinate.

    Args:
        osm_groceries: needs columns ``id``, ``name``, ``lat``, ``lng``.
        geolytix: needs ``category`` (the brand), ``lat``, ``lng``.
        radius_m: maximum separation, in metres, for two points to count as
            the same store.

    Returns:
        The ``id`` of every duplicate OSM row, in input row order; empty when
        either frame is empty or has no usable coordinates.
    """
    if osm_groceries.is_empty() or geolytix.is_empty():
        return []

    from scipy.spatial import cKDTree

    glx_lat = geolytix["lat"].to_numpy().astype(float)
    glx_lng = geolytix["lng"].to_numpy().astype(float)
    glx_brand_tokens = [_significant_tokens(b) for b in geolytix["category"].to_list()]

    osm_lat = osm_groceries["lat"].to_numpy().astype(float)
    osm_lng = osm_groceries["lng"].to_numpy().astype(float)
    osm_ids = osm_groceries["id"].to_list()
    osm_name_tokens = [_significant_tokens(n) for n in osm_groceries["name"].to_list()]

    # Keep only rows with usable coordinates, remembering their original row
    # numbers so KD-tree positions can be mapped back to ids / token sets.
    glx_rows = np.flatnonzero(np.isfinite(glx_lat) & np.isfinite(glx_lng))
    osm_rows = np.flatnonzero(np.isfinite(osm_lat) & np.isfinite(osm_lng))
    if glx_rows.size == 0 or osm_rows.size == 0:
        return []
    glx_lat, glx_lng = glx_lat[glx_rows], glx_lng[glx_rows]
    osm_lat, osm_lng = osm_lat[osm_rows], osm_lng[osm_rows]

    # Equirectangular projection to metres around the shared mean latitude — at
    # England's scale this is accurate to well under the dedup radius.
    mean_lat = float(np.mean(np.concatenate([glx_lat, osm_lat])))
    cos_lat = float(np.cos(np.radians(mean_lat)))
    glx_xy = np.column_stack([glx_lng * cos_lat * 111_320.0, glx_lat * 110_540.0])
    osm_xy = np.column_stack([osm_lng * cos_lat * 111_320.0, osm_lat * 110_540.0])

    tree = cKDTree(glx_xy)
    # One list of nearby GEOLYTIX positions per (filtered) OSM point.
    neighbours = tree.query_ball_point(osm_xy, r=radius_m)

    drop_ids: list[str] = []
    for osm_pos, glx_positions in enumerate(neighbours):
        osm_idx = int(osm_rows[osm_pos])
        tokens = osm_name_tokens[osm_idx]
        if not tokens:
            # Unnamed OSM rows cannot be attributed to any brand.
            continue
        for glx_pos in glx_positions:
            brand = glx_brand_tokens[int(glx_rows[glx_pos])]
            # An empty brand set is a subset of everything, so it must be
            # excluded explicitly or it would match every named OSM row.
            if brand and brand.issubset(tokens):
                drop_ids.append(osm_ids[osm_idx])
                break
    return drop_ids
|
||||
|
||||
|
||||
def transform(
|
||||
input_path: Path,
|
||||
naptan_path: Path,
|
||||
|
|
@ -1553,6 +1643,27 @@ def transform(
|
|||
|
||||
grocery_df = pl.read_parquet(grocery_retail_points_path)
|
||||
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
|
||||
|
||||
# Drop OSM grocery rows that duplicate a GEOLYTIX store (same brand,
|
||||
# colocated) so a Tesco Express / Co-op / Spar isn't counted twice.
|
||||
osm_groceries = (
|
||||
lf.filter(pl.col("group") == "Groceries")
|
||||
.select("id", "name", "lat", "lng")
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
duplicate_ids = osm_groceries_colocated_with_geolytix(osm_groceries, grocery_pois)
|
||||
if duplicate_ids:
|
||||
print(
|
||||
f"Dropping {len(duplicate_ids):,} OSM grocery POIs that duplicate a "
|
||||
"GEOLYTIX store"
|
||||
)
|
||||
# Scope the drop to the Groceries group: a single OSM object can also
|
||||
# carry a non-grocery aspect (e.g. a convenience store that is also a
|
||||
# Post Office), which must survive — only its duplicate grocery row goes.
|
||||
lf = lf.filter(
|
||||
~((pl.col("group") == "Groceries") & pl.col("id").is_in(duplicate_ids))
|
||||
)
|
||||
|
||||
frames = [
|
||||
lf,
|
||||
naptan,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue