try
Some checks failed
CI / Check (push) Failing after 3m22s
Build and publish Docker image / build-and-push (push) Successful in 7m25s

This commit is contained in:
Andras Schmelczer 2026-06-04 22:34:26 +01:00
parent 843d14b7ba
commit c938b71904
13 changed files with 698 additions and 109 deletions

View file

@ -30,6 +30,7 @@ from pipeline.utils.postcode_mapping import build_postcode_mapping
# Floor-area cutoff in square metres: the "Price per sqm" expression is only
# evaluated when total_floor_area is strictly greater than this value.
MIN_FLOOR_AREA_M2 = 10
# Output column names for area-level features listed in _AREA_COLUMNS.
CONSERVATION_AREA_FEATURE = "Within conservation area"
TREE_DENSITY_FEATURE = "Street tree density percentile"
# Listed-building feature name and its matching parameters.
# NOTE(review): presumably a match radius in metres and the number of nearest
# postcodes considered — confirm against the code that consumes them.
LISTED_BUILDING_FEATURE = "Listed building"
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
LISTED_BUILDING_NEAREST_POSTCODES = 3
@ -92,6 +93,10 @@ _AREA_COLUMNS = [
"Noise (dB)",
"Max available download speed (Mbps)",
CONSERVATION_AREA_FEATURE,
# Tree canopy is a 50m-radius percentile around the postcode centroid, so it
# is postcode-grain: it belongs in the area output (one value per postcode,
# covering property-less postcodes too) rather than duplicated per property.
TREE_DENSITY_FEATURE,
# Schools
"Good+ primary schools within 5km",
"Good+ secondary schools within 5km",
@ -116,7 +121,6 @@ _AREA_COLUMNS = [
# Patterns matching amenity column names of the form
# "Distance to nearest amenity (<name>) (km)" and
# "Number of amenities (<name>) within 2km" / "... within 5km".
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
# NOTE(review): the same name and value are also assigned next to
# CONSERVATION_AREA_FEATURE earlier in this file — confirm that only one
# assignment is meant to remain.
TREE_DENSITY_FEATURE = "Street tree density percentile"
# Matches names such as "Tree canopy density percentile within 50m" (any whole
# number of metres).
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
r"^Tree canopy density percentile within \d+m$"
)
@ -818,9 +822,9 @@ def _dedupe_collapsed_properties(wide: pl.LazyFrame) -> pl.LazyFrame:
untouched. pp_address is non-null here (join_epc_pp filters it), so the key
never merges unrelated rows.
"""
return wide.sort(
"date_of_transfer", descending=True, nulls_last=True
).unique(subset=["postcode", "pp_address"], keep="first", maintain_order=True)
return wide.sort("date_of_transfer", descending=True, nulls_last=True).unique(
subset=["postcode", "pp_address"], keep="first", maintain_order=True
)
def _filter_to_active_english_postcodes(
@ -1108,13 +1112,26 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
return epc_band_to_year(pl.col(column))
def _address_score(query: str, candidate: str | None) -> int:
def _address_score(query: str, candidate: str | None, *, allow_token_set: bool) -> int:
if not candidate:
return 0
return max(
fuzz.token_set_ratio(query, candidate),
fuzz.token_sort_ratio(query, candidate),
)
# token_set_ratio returns 100 whenever the shorter token set is a subset of
# the longer. For a NUMBER-LESS query that is unsafe — a single locality
# token (e.g. "KINGSWOOD") subsets to 100 against any long address that
# merely contains it — so number-less queries score with token_sort_ratio
# only, matching the canonical fuzzy_join._score_bucket. For a NUMBERED
# query the unconditional _numbers_compatible gate has already guaranteed the
# candidate carries compatible house numbers, so token_set cannot inflate
# across different addresses; allowing it recovers genuine matches where the
# scraped listing appends trailing town/county tokens the bare register
# address omits (e.g. "105 RIDGEWAY DRIVE BROMLEY KENT" vs "105 RIDGEWAY
# DRIVE").
if allow_token_set:
return max(
fuzz.token_set_ratio(query, candidate),
fuzz.token_sort_ratio(query, candidate),
)
return fuzz.token_sort_ratio(query, candidate)
def _has_number(address: str | None) -> bool:
@ -1153,9 +1170,12 @@ def _best_listing_match(
``uprn_index`` (postcode-independent, so it is robust even when the
listing's postcode is slightly off); (2) failing that, the highest
fuzzy street-address similarity within the listing's own postcode bucket.
No property-attribute heuristics are used — a house number in the listing
address gates the fuzzy match (`_numbers_compatible`) and lowers the score
threshold; a number-less address must match the street almost exactly.
No property-attribute heuristics are used — `_numbers_compatible` gates
every fuzzy match unconditionally (so a number-less listing can never match
a numbered property, and vice versa), as in the canonical
`fuzzy_join._score_bucket`. A house number additionally lowers the score
threshold and (via `_address_score`) permits token_set scoring; a number-less
address scores on token_sort only and must match the street almost exactly.
``addressed_fields`` names the candidate columns to fuzzy-match against (a
candidate may carry both a register and an EPC address). Returns
@ -1180,9 +1200,11 @@ def _best_listing_match(
address = candidate.get(field)
if not address:
continue
if listing_has_numbers and not _numbers_compatible(query, address):
# Unconditional number gate (matches fuzzy_join): a number-less
# listing cannot match a numbered candidate and vice versa.
if not _numbers_compatible(query, address):
continue
score = _address_score(query, address)
score = _address_score(query, address, allow_token_set=listing_has_numbers)
if score > best_score:
best_score = score
best = candidate
@ -1675,7 +1697,9 @@ def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
# "Yes". "Former council house" should fire if EITHER side says so.
if raw_column == "was_council_house":
return (
pl.when((pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes"))
pl.when(
(pl.col(raw_column) == "Yes") | (pl.col(direct_column) == "Yes")
)
.then(pl.lit("Yes"))
.otherwise(coalesce)
.alias(raw_column)
@ -1716,9 +1740,13 @@ def _build_unmatched_listing_seed_rows(
"total_floor_area": pl.coalesce(
pl.col("_actual_total_floor_area"), pl.col("_direct_total_floor_area")
),
# Prefer the direct-EPC habitable-room count over the listing's value:
# the scraped room count is bedrooms + bathrooms (upstream storage.py
# defect), so it over-counts. Fall back to the listing value only when
# the direct-EPC match has no count.
"number_habitable_rooms": pl.coalesce(
pl.col("_actual_number_habitable_rooms"),
pl.col("_direct_number_habitable_rooms"),
pl.col("_actual_number_habitable_rooms"),
),
"latest_price": pl.col("_actual_asking_price"),
}
@ -1836,14 +1864,19 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
# Listing coordinates win over the postcode centroid.
pl.coalesce(pl.col("_actual_lat").cast(pl.Float64), pl.col("lat")).alias("lat"),
pl.coalesce(pl.col("_actual_lon").cast(pl.Float64), pl.col("lon")).alias("lon"),
# Listing's floor area / rooms override any EPC/PP value when present.
# Listing's floor area overrides any EPC/PP value when present.
pl.coalesce(
pl.col("_actual_total_floor_area").cast(pl.Float64),
pl.col("Total floor area (sqm)"),
).alias("Total floor area (sqm)"),
# Rooms: prefer the EPC habitable-room count and fall back to the listing
# value only when no EPC count exists. The scraped "Number of bedrooms &
# living rooms" is actually bedrooms + bathrooms (an upstream storage.py
# defect), so preferring it would inflate the room count and overwrite a
# correct EPC value.
pl.coalesce(
pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
pl.col("Number of bedrooms & living rooms"),
pl.col("_actual_number_habitable_rooms").cast(pl.Int16),
).alias("Number of bedrooms & living rooms"),
pl.when(pl.col("_actual_property_type").is_in(_PROPERTY_TYPE_VALUES))
.then(pl.col("_actual_property_type"))
@ -2130,12 +2163,15 @@ def _build(
pl.when(
(pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
& (
(pl.col("latest_price") / pl.col("total_floor_area"))
.is_between(MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM)
(pl.col("latest_price") / pl.col("total_floor_area")).is_between(
MIN_COMPARABLE_PSM, MAX_COMPARABLE_PSM
)
)
)
.then(
(pl.col("latest_price") / pl.col("total_floor_area")).round(0).cast(pl.Int32)
(pl.col("latest_price") / pl.col("total_floor_area"))
.round(0)
.cast(pl.Int32)
)
.otherwise(None)
.alias("Price per sqm"),