This commit is contained in:
Andras Schmelczer 2026-05-28 21:48:35 +01:00
parent 39ef5c6646
commit c995f12f8b
78 changed files with 4830 additions and 1619 deletions

View file

@ -1,960 +0,0 @@
import argparse
import re
import tempfile
from pathlib import Path
import polars as pl
from thefuzz import fuzz
from tqdm import tqdm
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.join_epc_pp import _scan_epc_certificates
from pipeline.utils.fuzzy_join import normalize_address_key, normalize_postcode_key
from pipeline.utils.postcode_mapping import build_postcode_mapping
# Floor-area constant in square metres; it is not referenced by the code
# visible in this part of the file.
MIN_FLOOR_AREA_M2 = 10.0

# Listing -> historical property matching: minimum total score when the
# listing address contains digits, the stricter minimum when it does not, and
# the lead the best candidate must hold over the runner-up.
PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
PROPERTY_MATCH_MIN_MARGIN = 4.0

# The same three thresholds for listing -> EPC certificate matching.
EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
EPC_MATCH_MIN_MARGIN = 4.0

# Written to the "Actual listing enrichment version" output column.
ENRICHMENT_VERSION = 1

# Matches each run of decimal digits in an address.
_NUMBER_RE = re.compile(r"\d+")

# Columns the input listings parquet must provide; _read_listings raises
# ValueError when any of them is absent.
LISTING_REQUIRED_COLUMNS = [
    "Bedrooms",
    "Bathrooms",
    "Number of bedrooms & living rooms",
    "lon",
    "lat",
    "Postcode",
    "Address per Property Register",
    "Leasehold/Freehold",
    "Property type",
    "Property sub-type",
    "Price qualifier",
    "Total floor area (sqm)",
    "Listing URL",
    "Listing features",
    "Listing date",
    "Listing status",
    "Asking price",
    "Asking price per sqm",
]

# Columns read from the historical properties parquet (those that exist there).
PROPERTY_CANDIDATE_COLUMNS = [
    "Address per Property Register",
    "Postcode",
    "Leasehold/Freehold",
    "Last known price",
    "Date of last transaction",
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Total floor area (sqm)",
    "Number of bedrooms & living rooms",
    "Interior height (m)",
    "Construction year",
    "Former council house",
    "Is construction date approximate",
    "Listed building",
    "Estimated monthly rent",
    "Street tree density percentile",
    "Property type",
    "Price per sqm",
    "Estimated current price",
    "Est. price per sqm",
]

# Subset of property columns; not referenced by the code visible here.
PROPERTY_ENRICHMENT_COLUMNS = [
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Interior height (m)",
    "Construction year",
    "Former council house",
    "Is construction date approximate",
    "Listed building",
    "Estimated monthly rent",
    "Street tree density percentile",
    "Date of last transaction",
]

# Columns carried over from a matched EPC certificate.
EPC_ENRICHMENT_COLUMNS = [
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Total floor area (sqm)",
    "Number of bedrooms & living rooms",
    "Interior height (m)",
    "Construction year",
    "Former council house",
]

# Closed value sets used to validate categorical columns.
EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
TENURE_VALUES = ["Freehold", "Leasehold"]
PROPERTY_TYPE_VALUES = [
    "Detached",
    "Semi-Detached",
    "Terraced",
    "Flats/Maisonettes",
    "Other",
]
# Dtype used when _ensure_prefixed_columns has to create a missing column as
# typed nulls; columns absent from this mapping fall back to Utf8 there.
COLUMN_DTYPES = {
    "Address per EPC": pl.Utf8,
    "Current energy rating": pl.Utf8,
    "Potential energy rating": pl.Utf8,
    "Total floor area (sqm)": pl.Float64,
    "Number of bedrooms & living rooms": pl.Int32,
    "Interior height (m)": pl.Float64,
    "Construction year": pl.UInt16,
    "Former council house": pl.Utf8,
    "Is construction date approximate": pl.UInt8,
    "Listed building": pl.Utf8,
    "Estimated monthly rent": pl.Float32,
    "Street tree density percentile": pl.Float32,
    "Date of last transaction": pl.Datetime("us"),
    "Property type": pl.Utf8,
    "Leasehold/Freehold": pl.Utf8,
}
def _canonical_postcode_expr(column: str) -> pl.Expr:
    """Build an expression that canonicalises the postcodes in ``column``.

    The text is upper-cased and every character outside A-Z / 0-9 is removed.
    When the compacted text has the expected postcode shape, a single space is
    inserted before the trailing digit-letter-letter group; anything else
    becomes null.
    """
    as_text = pl.col(column).cast(pl.Utf8)
    squeezed = (
        as_text.str.to_uppercase()
        .str.replace_all(r"[^A-Z0-9]+", "")
        .str.strip_chars()
    )
    looks_valid = squeezed.str.contains(r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$")
    spaced = squeezed.str.replace(r"^(.+)([0-9][A-Z]{2})$", "${1} ${2}")
    return pl.when(looks_valid).then(spaced).otherwise(None)
def _clean_string_expr(column: str) -> pl.Expr:
    """Return ``column`` as whitespace-trimmed text, with empty strings as null."""
    trimmed = pl.col(column).cast(pl.Utf8).str.strip_chars()
    is_blank = trimmed == ""
    return pl.when(is_blank).then(None).otherwise(trimmed)
def _coalesce_non_empty(*columns: str) -> pl.Expr:
    """Coalesce text columns left to right, skipping nulls and blank strings.

    A value counts as blank when it is empty after trimming whitespace; the
    value that is finally returned is the untrimmed text.
    """

    def blank_as_null(name: str) -> pl.Expr:
        text = pl.col(name).cast(pl.Utf8)
        return pl.when(text.str.strip_chars() == "").then(None).otherwise(text)

    return pl.coalesce([blank_as_null(name) for name in columns])
def _valid_number_expr(column: str) -> pl.Expr:
    """Return ``column`` with every value that is not finite replaced by null."""
    value = pl.col(column)
    return pl.when(value.is_finite()).then(value).otherwise(None)
def _read_listings(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
    """Load the scraped listings and attach the keys used for matching.

    Adds ``_listing_idx`` (row index), ``_original_postcode`` (canonical form
    of the scraped postcode), ``_listing_match_address`` and
    ``_listing_match_postcode``. ``Postcode`` is replaced by the mapped
    ``new_postcode`` when the postcode mapping has one, otherwise by the
    canonical postcode, otherwise it is left as scraped.

    Args:
        listings_path: Parquet file with the scraped listings.
        arcgis_path: Source handed to ``build_postcode_mapping``; the result
            is joined on its ``old_postcode`` column and supplies
            ``new_postcode``.

    Raises:
        ValueError: If any of ``LISTING_REQUIRED_COLUMNS`` is missing.
    """
    # One lazy scan serves both the schema check and the actual read.
    scan = pl.scan_parquet(listings_path)
    missing = sorted(
        set(LISTING_REQUIRED_COLUMNS) - set(scan.collect_schema().names())
    )
    if missing:
        raise ValueError(f"{listings_path} is missing listing columns: {missing}")

    # The postcode match key is derived only once, after remapping below;
    # computing it here as well would just be overwritten.
    listings = (
        scan.with_row_index("_listing_idx")
        .with_columns(
            _canonical_postcode_expr("Postcode").alias("_original_postcode"),
            normalize_address_key(pl.col("Address per Property Register")).alias(
                "_listing_match_address"
            ),
        )
        .collect(engine="streaming")
    )

    postcode_mapping = build_postcode_mapping(arcgis_path)
    return (
        listings.join(
            postcode_mapping,
            left_on="_original_postcode",
            right_on="old_postcode",
            how="left",
        )
        .with_columns(
            pl.coalesce("new_postcode", "_original_postcode", "Postcode").alias(
                "Postcode"
            ),
        )
        .drop("new_postcode", strict=False)
        .with_columns(
            normalize_postcode_key(pl.col("Postcode")).alias("_listing_match_postcode"),
        )
    )
def _load_property_candidates(
    properties_path: Path, listing_postcodes: list[str]
) -> pl.DataFrame:
    """Load historical properties that share a postcode key with any listing.

    Only the ``PROPERTY_CANDIDATE_COLUMNS`` present in the file are read. Rows
    receive ``_match_postcode``, ``_match_register_address`` and
    ``_match_epc_address`` keys; rows where both address keys are null are
    dropped, and the survivors are numbered in ``_property_row``.

    Raises:
        ValueError: If the file lacks the register address, postcode,
            property type or floor area column.
    """
    available = set(pl.scan_parquet(properties_path).collect_schema().names())
    columns = [name for name in PROPERTY_CANDIDATE_COLUMNS if name in available]

    required = {
        "Address per Property Register",
        "Postcode",
        "Property type",
        "Total floor area (sqm)",
    }
    missing = sorted(required - set(columns))
    if missing:
        raise ValueError(f"{properties_path} is missing property columns: {missing}")

    # Without an EPC address column the EPC key is simply an all-null column.
    if "Address per EPC" in columns:
        epc_key = normalize_address_key(pl.col("Address per EPC"))
    else:
        epc_key = pl.lit(None, dtype=pl.Utf8)
    register_key = normalize_address_key(pl.col("Address per Property Register"))
    postcode_key = normalize_postcode_key(pl.col("Postcode"))

    has_any_address = (
        pl.col("_match_register_address").is_not_null()
        | pl.col("_match_epc_address").is_not_null()
    )
    lazy = (
        pl.scan_parquet(properties_path)
        .select(columns)
        .with_columns(postcode_key.alias("_match_postcode"))
        .filter(pl.col("_match_postcode").is_in(listing_postcodes))
        .with_columns(
            register_key.alias("_match_register_address"),
            epc_key.alias("_match_epc_address"),
        )
        .filter(has_any_address)
        .with_row_index("_property_row")
    )
    return lazy.collect(engine="streaming")
def _property_candidates_by_postcode(
    candidates: pl.DataFrame,
) -> dict[str, list[dict]]:
    """Group candidate rows (as dicts) by their ``_match_postcode`` value.

    Rows with a null or empty postcode key are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_match_postcode")
        if not key:
            continue
        if key in grouped:
            grouped[key].append(record)
        else:
            grouped[key] = [record]
    return grouped
def _numbers_compatible(left: str | None, right: str | None) -> bool:
    """Tell whether the digit runs of two addresses are consistent.

    The address with fewer distinct numbers must have all of them present in
    the other one. An address without numbers is incompatible with one that
    has numbers; two addresses without numbers are compatible. A missing or
    empty address is never compatible.
    """
    if not (left and right):
        return False

    left_nums = set(_NUMBER_RE.findall(left))
    right_nums = set(_NUMBER_RE.findall(right))
    if len(left_nums) <= len(right_nums):
        smaller, larger = left_nums, right_nums
    else:
        smaller, larger = right_nums, left_nums

    if larger and not smaller:
        return False
    return smaller <= larger
def _has_number(address: str | None) -> bool:
    """Return True when ``address`` is non-empty and contains a digit."""
    if not address:
        return False
    return _NUMBER_RE.search(address) is not None
def _ratio_bonus(
left: float | int | None, right: float | int | None, pct: float, cap: float
) -> float:
if left is None or right is None:
return 0.0
try:
left_f = float(left)
right_f = float(right)
except (TypeError, ValueError):
return 0.0
if left_f <= 0 or right_f <= 0:
return 0.0
rel = abs(left_f - right_f) / max(left_f, right_f)
if rel > pct:
return 0.0
return cap * (1.0 - rel / pct)
def _rooms_bonus(left: int | None, right: int | None) -> float:
if left is None or right is None:
return 0.0
try:
diff = abs(int(left) - int(right))
except (TypeError, ValueError):
return 0.0
if diff == 0:
return 4.0
if diff == 1:
return 2.0
return 0.0
def _enum_bonus(
left: str | None, right: str | None, *, exact: float, mismatch: float
) -> float:
if not left or not right:
return 0.0
return exact if left == right else mismatch
def _address_score(query: str, candidate: str | None) -> int:
    """Return the fuzzy similarity between ``query`` and ``candidate``.

    The score is the larger of thefuzz's token-set and token-sort ratios; a
    missing or empty candidate scores 0.
    """
    if not candidate:
        return 0
    set_score = fuzz.token_set_ratio(query, candidate)
    sort_score = fuzz.token_sort_ratio(query, candidate)
    return max(set_score, sort_score)
def _best_property_candidate(listing: dict, candidates: list[dict]) -> dict | None:
    """Pick the historical property that best matches ``listing``.

    Each candidate is scored by the better of its register-address and
    EPC-address fuzzy scores, adjusted by bonuses/penalties for property type,
    tenure, floor area, room count and price. When the listing address
    contains digits, candidates whose numbers are incompatible with it on both
    addresses are skipped.

    Returns:
        A match record for the top candidate, or None when the listing has no
        match address, nothing scores above zero, the top score is below the
        applicable threshold, or its lead over the runner-up (the score
        itself for a lone candidate) is below ``PROPERTY_MATCH_MIN_MARGIN``.
    """
    query = listing.get("_listing_match_address")
    if not query:
        return None

    listing_has_numbers = _has_number(query)
    ranked: list[tuple[float, int, dict, str]] = []
    for candidate in candidates:
        register_key = candidate.get("_match_register_address")
        epc_key = candidate.get("_match_epc_address")

        if listing_has_numbers:
            numbers_agree = _numbers_compatible(
                query, register_key
            ) or _numbers_compatible(query, epc_key)
            if not numbers_agree:
                continue

        register_score = _address_score(query, register_key)
        epc_score = _address_score(query, epc_key)
        base_score = max(register_score, epc_score)
        if base_score == 0:
            continue

        reference_price = candidate.get("Estimated current price") or candidate.get(
            "Last known price"
        )
        adjustments = (
            _enum_bonus(
                listing.get("Property type"),
                candidate.get("Property type"),
                exact=7.0,
                mismatch=-8.0,
            ),
            _enum_bonus(
                listing.get("Leasehold/Freehold"),
                candidate.get("Leasehold/Freehold"),
                exact=3.0,
                mismatch=-3.0,
            ),
            _ratio_bonus(
                listing.get("Total floor area (sqm)"),
                candidate.get("Total floor area (sqm)"),
                pct=0.15,
                cap=8.0,
            ),
            _rooms_bonus(
                listing.get("Number of bedrooms & living rooms"),
                candidate.get("Number of bedrooms & living rooms"),
            ),
            _ratio_bonus(
                listing.get("Asking price"),
                reference_price,
                pct=0.25,
                cap=3.0,
            ),
        )
        # Accumulate one term at a time so the floating-point result does not
        # depend on summation order.
        score = float(base_score)
        for delta in adjustments:
            score += delta

        # Ties go to the register address.
        if epc_score > register_score:
            matched_field = "Address per EPC"
        else:
            matched_field = "Address per Property Register"
        ranked.append((score, base_score, candidate, matched_field))

    if not ranked:
        return None

    ranked.sort(key=lambda entry: entry[0], reverse=True)
    best_score, best_base, best_candidate, best_field = ranked[0]
    margin = best_score - ranked[1][0] if len(ranked) > 1 else best_score

    if listing_has_numbers:
        threshold = PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
    else:
        threshold = PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    if best_score < threshold or margin < PROPERTY_MATCH_MIN_MARGIN:
        return None

    return {
        "_listing_idx": listing["_listing_idx"],
        "_property_row": best_candidate["_property_row"],
        "Historical property match score": round(best_score, 1),
        "Historical property address score": best_base,
        "Historical property match margin": round(margin, 1),
        "Historical property match field": best_field,
        "Historical property match status": "matched",
    }
def _match_properties(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Match every listing against the property candidates in its postcode.

    Returns:
        One row per matched listing with ``_listing_idx``, ``_property_row``
        and the "Historical property ..." diagnostics. The frame is empty but
        carries the same schema when there are no candidates or no matches.
    """
    schema = {
        "_listing_idx": pl.UInt32,
        "_property_row": pl.UInt32,
        "Historical property match score": pl.Float32,
        "Historical property address score": pl.Int32,
        "Historical property match margin": pl.Float32,
        "Historical property match field": pl.Utf8,
        "Historical property match status": pl.Utf8,
    }
    if candidates.is_empty():
        return pl.DataFrame(schema=schema)

    by_postcode = _property_candidates_by_postcode(candidates)
    progress = tqdm(
        listings.iter_rows(named=True),
        total=listings.height,
        desc="Matching historical properties",
    )
    found = []
    for listing in progress:
        postcode = listing.get("_listing_match_postcode")
        if postcode:
            result = _best_property_candidate(listing, by_postcode.get(postcode, []))
            if result is not None:
                found.append(result)

    if found:
        return pl.DataFrame(found, schema=schema)
    return pl.DataFrame(schema=schema)
def _prefix_columns(df: pl.DataFrame, columns: list[str], prefix: str) -> pl.DataFrame:
    """Rename each of ``columns`` that exists in ``df`` to ``prefix + name``."""
    present = set(df.columns)
    mapping = {name: f"{prefix}{name}" for name in columns if name in present}
    return df.rename(mapping)
def _ensure_prefixed_columns(
    df: pl.DataFrame, columns: list[str], prefix: str
) -> pl.DataFrame:
    """Guarantee that ``prefix + name`` exists for every name in ``columns``.

    Missing columns are added as all-null columns typed from
    ``COLUMN_DTYPES`` (Utf8 when the name has no entry). ``df`` is returned
    unchanged when nothing is missing.
    """
    present = set(df.columns)
    additions = []
    for name in columns:
        target = f"{prefix}{name}"
        if target in present:
            continue
        dtype = COLUMN_DTYPES.get(name, pl.Utf8)
        additions.append(pl.lit(None, dtype=dtype).alias(target))
    return df.with_columns(additions) if additions else df
def _property_match_frame(
    matches: pl.DataFrame, candidates: pl.DataFrame
) -> pl.DataFrame:
    """Attach the matched property's columns to each match row.

    The property columns are joined on ``_property_row`` and renamed with a
    ``_property_`` prefix. An empty ``matches`` frame is returned as is.
    """
    if matches.is_empty():
        return matches

    carried = [
        name for name in PROPERTY_CANDIDATE_COLUMNS if name in candidates.columns
    ]
    joined = matches.join(
        candidates.select(["_property_row", *carried]),
        on="_property_row",
        how="left",
    )
    to_prefix = [name for name in PROPERTY_CANDIDATE_COLUMNS if name in joined.columns]
    return _prefix_columns(joined, to_prefix, "_property_")
def _canonical_epc_property_type_expr() -> pl.Expr:
    """Derive a property type from the EPC ``epc_property_type``/``built_form``.

    Houses with a usable ``built_form`` take the built form; every other row
    with an EPC property type takes that type; rows without one are null. The
    value is then passed through a replacement table that folds the flat,
    maisonette and terrace variants together and maps bungalows, park homes
    and unqualified houses to "Other".
    """
    built_form = pl.col("built_form")
    epc_type = pl.col("epc_property_type")

    built_form_unusable = built_form.is_null() | built_form.is_in(
        ["NO DATA!", "Not Recorded"]
    )
    has_epc = epc_type.is_not_null()
    is_house = epc_type == "House"

    raw_type = (
        pl.when(has_epc & is_house & ~built_form_unusable)
        .then(built_form)
        .when(has_epc)
        .then(epc_type)
        .otherwise(None)
    )
    canonical_names = {
        "Flat": "Flats/Maisonettes",
        "Maisonette": "Flats/Maisonettes",
        "End-Terrace": "Terraced",
        "Mid-Terrace": "Terraced",
        "Enclosed End-Terrace": "Terraced",
        "Enclosed Mid-Terrace": "Terraced",
        "Bungalow": "Other",
        "Park home": "Other",
        "House": "Other",
    }
    return raw_type.replace(canonical_names)
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    """Extract the first four-digit year from an age-band text column.

    The "England and Wales: " and " onwards" fragments are removed first; the
    result is UInt16, or null where no four-digit number is found.
    """
    band = pl.col(column).cast(pl.Utf8)
    trimmed = band.str.replace("England and Wales: ", "").str.replace(" onwards", "")
    first_year = trimmed.str.extract(r"(\d{4})", 1)
    return first_year.cast(pl.UInt16, strict=False)
def _fractional_year_expr(column: str) -> pl.Expr:
    """Convert a temporal column to ``year + (month - 1) / 12`` as Float32."""
    stamp = pl.col(column).dt
    whole_years = stamp.year().cast(pl.Float32)
    months_elapsed = stamp.month().cast(pl.Float32) - 1.0
    return whole_years + months_elapsed / 12.0
def _load_epc_candidates(
    epc_path: Path, listing_postcodes: list[str], temp_dir: Path
) -> pl.DataFrame:
    """Load one EPC candidate per normalised address in the listing postcodes.

    Certificates are read through ``_scan_epc_certificates``, restricted to
    the listing postcode keys, and reduced to the most recently inspected
    certificate per (address key, postcode key). "Former council house" is
    "Yes" when any certificate for that address, not only the newest one, has
    a tenure containing "social", otherwise "No".

    Args:
        epc_path: EPC certificates source passed to ``_scan_epc_certificates``.
        listing_postcodes: Normalised postcode keys of the listings.
        temp_dir: Scratch directory passed to ``_scan_epc_certificates``.

    Returns:
        A frame with ``_epc_row``, the two match keys,
        ``_epc_canonical_property_type`` and ``EPC_ENRICHMENT_COLUMNS``.
    """
    key_columns = ["_epc_match_address", "_epc_match_postcode"]

    # Both the "latest certificate" and the "ever social tenure" branches work
    # on the same postcode-restricted scan, so build it once.
    in_scope = (
        _scan_epc_certificates(epc_path, temp_dir)
        .with_columns(
            normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
            normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
        )
        .filter(pl.col("_epc_match_postcode").is_in(listing_postcodes))
    )

    def valid_rating(source: str) -> pl.Expr:
        # Ratings outside A-G are treated as missing.
        return (
            pl.when(pl.col(source).is_in(EPC_RATING_VALUES))
            .then(pl.col(source))
            .otherwise(None)
        )

    latest = (
        # nulls_last keeps a certificate without an inspection date from
        # outranking dated ones when .first() picks the newest per address.
        in_scope.sort("inspection_date", descending=True, nulls_last=True)
        .group_by(*key_columns)
        .first()
        .with_columns(
            _canonical_epc_property_type_expr().alias("_epc_canonical_property_type"),
            _construction_year_expr().alias("Construction year"),
            valid_rating("current_energy_rating").alias("Current energy rating"),
            valid_rating("potential_energy_rating").alias("Potential energy rating"),
            pl.col("total_floor_area").alias("Total floor area (sqm)"),
            pl.col("number_habitable_rooms").alias("Number of bedrooms & living rooms"),
            pl.col("floor_height").alias("Interior height (m)"),
            pl.col("epc_address").alias("Address per EPC"),
        )
        .drop("tenure", strict=False)
    )

    social_tenure = (
        in_scope.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
        .select(*key_columns)
        .unique()
        .with_columns(pl.lit("Yes").alias("Former council house"))
    )

    return (
        latest.join(social_tenure, on=key_columns, how="left")
        .with_columns(pl.col("Former council house").fill_null("No"))
        .filter(pl.col("_epc_match_address").is_not_null())
        .with_row_index("_epc_row")
        .select(
            "_epc_row",
            "_epc_match_address",
            "_epc_match_postcode",
            "_epc_canonical_property_type",
            *EPC_ENRICHMENT_COLUMNS,
        )
        .collect(engine="streaming")
    )
def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
    """Group EPC candidate rows (as dicts) by ``_epc_match_postcode``.

    Rows with a null or empty postcode key are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_epc_match_postcode")
        if not key:
            continue
        if key in grouped:
            grouped[key].append(record)
        else:
            grouped[key] = [record]
    return grouped
def _best_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
    """Pick the EPC certificate that best matches ``listing``.

    Each candidate is scored by address similarity, adjusted by
    bonuses/penalties for property type, floor area and room count. When the
    listing address contains digits, candidates with incompatible numbers are
    skipped.

    Returns:
        A match record for the top candidate, or None when the listing has no
        match address, nothing scores above zero, the top score is below the
        applicable threshold, or its lead over the runner-up (the score
        itself for a lone candidate) is below ``EPC_MATCH_MIN_MARGIN``.
    """
    query = listing.get("_listing_match_address")
    if not query:
        return None

    listing_has_numbers = _has_number(query)
    ranked: list[tuple[float, int, dict]] = []
    for candidate in candidates:
        address = candidate.get("_epc_match_address")
        if listing_has_numbers and not _numbers_compatible(query, address):
            continue

        base_score = _address_score(query, address)
        if base_score == 0:
            continue

        adjustments = (
            _enum_bonus(
                listing.get("Property type"),
                candidate.get("_epc_canonical_property_type"),
                exact=6.0,
                mismatch=-6.0,
            ),
            _ratio_bonus(
                listing.get("Total floor area (sqm)"),
                candidate.get("Total floor area (sqm)"),
                pct=0.12,
                cap=8.0,
            ),
            _rooms_bonus(
                listing.get("Number of bedrooms & living rooms"),
                candidate.get("Number of bedrooms & living rooms"),
            ),
        )
        # Accumulate one term at a time so the floating-point result does not
        # depend on summation order.
        score = float(base_score)
        for delta in adjustments:
            score += delta
        ranked.append((score, base_score, candidate))

    if not ranked:
        return None

    ranked.sort(key=lambda entry: entry[0], reverse=True)
    best_score, best_base, best_candidate = ranked[0]
    margin = best_score - ranked[1][0] if len(ranked) > 1 else best_score

    if listing_has_numbers:
        threshold = EPC_MATCH_MIN_SCORE_WITH_NUMBERS
    else:
        threshold = EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    if best_score < threshold or margin < EPC_MATCH_MIN_MARGIN:
        return None

    return {
        "_listing_idx": listing["_listing_idx"],
        "_epc_row": best_candidate["_epc_row"],
        "EPC match score": round(best_score, 1),
        "EPC address score": best_base,
        "EPC match margin": round(margin, 1),
        "EPC match status": "matched",
    }
def _match_epc(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Match every listing against the EPC candidates in its postcode.

    Returns:
        One row per matched listing with ``_listing_idx``, ``_epc_row`` and
        the "EPC ..." diagnostics. The frame is empty but carries the same
        schema when there are no candidates or no matches.
    """
    schema = {
        "_listing_idx": pl.UInt32,
        "_epc_row": pl.UInt32,
        "EPC match score": pl.Float32,
        "EPC address score": pl.Int32,
        "EPC match margin": pl.Float32,
        "EPC match status": pl.Utf8,
    }
    if candidates.is_empty():
        return pl.DataFrame(schema=schema)

    by_postcode = _epc_candidates_by_postcode(candidates)
    progress = tqdm(
        listings.iter_rows(named=True),
        total=listings.height,
        desc="Matching EPC certificates",
    )
    found = []
    for listing in progress:
        postcode = listing.get("_listing_match_postcode")
        if postcode:
            result = _best_epc_candidate(listing, by_postcode.get(postcode, []))
            if result is not None:
                found.append(result)

    if found:
        return pl.DataFrame(found, schema=schema)
    return pl.DataFrame(schema=schema)
def _epc_match_frame(matches: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Attach the matched certificate's columns to each EPC match row.

    ``EPC_ENRICHMENT_COLUMNS`` are joined on ``_epc_row`` and renamed with an
    ``_epc_`` prefix. An empty ``matches`` frame is returned as is.
    """
    if matches.is_empty():
        return matches

    certificate_columns = candidates.select("_epc_row", *EPC_ENRICHMENT_COLUMNS)
    joined = matches.join(certificate_columns, on="_epc_row", how="left")
    to_prefix = [name for name in EPC_ENRICHMENT_COLUMNS if name in joined.columns]
    return _prefix_columns(joined, to_prefix, "_epc_")
def _join_postcode_features(
    listings: pl.DataFrame, postcode_features_path: Path
) -> pl.DataFrame:
    """Left-join the postcode feature parquet onto ``listings`` by "Postcode".

    Feature columns whose names clash with listing columns receive a
    ``_postcode`` suffix.
    """
    features = pl.scan_parquet(postcode_features_path).collect(engine="streaming")
    return listings.join(features, on="Postcode", how="left", suffix="_postcode")
def _coalesce_feature_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Resolve the final feature columns from listing, EPC and property data.

    Listing values win where they are valid; otherwise values from the matched
    EPC certificate (``_epc_`` columns) are preferred over those from the
    matched historical property (``_property_`` columns). Both price
    estimates are set to the asking price, and the three per-sqm columns are
    recomputed from the asking price and the resolved floor area.
    """

    def epc_then_property_text(name: str) -> pl.Expr:
        return _coalesce_non_empty(f"_epc_{name}", f"_property_{name}")

    def from_property(name: str) -> pl.Expr:
        return pl.col(f"_property_{name}")

    asking_price = pl.col("Asking price")
    rooms = pl.col("Number of bedrooms & living rooms")

    property_type = (
        pl.when(pl.col("Property type").is_in(PROPERTY_TYPE_VALUES))
        .then(pl.col("Property type"))
        .otherwise(from_property("Property type"))
    )
    tenure = (
        pl.when(pl.col("Leasehold/Freehold").is_in(TENURE_VALUES))
        .then(pl.col("Leasehold/Freehold"))
        .otherwise(from_property("Leasehold/Freehold"))
    )
    floor_area = pl.coalesce(
        _valid_number_expr("Total floor area (sqm)"),
        _valid_number_expr("_epc_Total floor area (sqm)"),
        _valid_number_expr("_property_Total floor area (sqm)"),
    )
    room_count = (
        pl.when(rooms > 0)
        .then(rooms)
        .otherwise(
            pl.coalesce(
                pl.col("_epc_Number of bedrooms & living rooms"),
                pl.col("_property_Number of bedrooms & living rooms"),
            )
        )
        .cast(pl.Int32, strict=False)
    )
    interior_height = pl.coalesce(
        _valid_number_expr("_epc_Interior height (m)"),
        _valid_number_expr("_property_Interior height (m)"),
    )
    construction_year = pl.coalesce(
        pl.col("_epc_Construction year"),
        pl.col("_property_Construction year"),
    ).cast(pl.UInt16, strict=False)

    resolved = df.with_columns(
        [
            pl.lit(ENRICHMENT_VERSION, dtype=pl.UInt16).alias(
                "Actual listing enrichment version"
            ),
            epc_then_property_text("Address per EPC").alias("Address per EPC"),
            property_type.alias("Property type"),
            tenure.alias("Leasehold/Freehold"),
            floor_area.alias("Total floor area (sqm)"),
            room_count.alias("Number of bedrooms & living rooms"),
            asking_price.alias("Estimated current price"),
            asking_price.alias("Last known price"),
            epc_then_property_text("Current energy rating").alias(
                "Current energy rating"
            ),
            epc_then_property_text("Potential energy rating").alias(
                "Potential energy rating"
            ),
            interior_height.alias("Interior height (m)"),
            construction_year.alias("Construction year"),
            epc_then_property_text("Former council house")
            .fill_null("No")
            .alias("Former council house"),
            from_property("Is construction date approximate").alias(
                "Is construction date approximate"
            ),
            from_property("Listed building").fill_null("No").alias("Listed building"),
            from_property("Estimated monthly rent").alias("Estimated monthly rent"),
            from_property("Street tree density percentile").alias(
                "Street tree density percentile"
            ),
            _fractional_year_expr("_property_Date of last transaction").alias(
                "Date of last transaction"
            ),
        ]
    )

    # Price per sqm needs the floor area resolved above, hence a second pass.
    area = pl.col("Total floor area (sqm)")
    can_divide = asking_price.is_not_null() & area.is_not_null() & (area > 0)
    price_per_sqm = (
        pl.when(can_divide)
        .then((asking_price / area).round(0))
        .otherwise(None)
        .cast(pl.Int32, strict=False)
    )
    resolved = resolved.with_columns(
        price_per_sqm.alias("Asking price per sqm"),
    )
    return resolved.with_columns(
        pl.col("Asking price per sqm").alias("Est. price per sqm"),
        pl.col("Asking price per sqm").alias("Price per sqm"),
    )
def _drop_internal_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Remove the helper columns that must not reach the output file.

    Dropped are every ``_property_``/``_epc_`` prefixed column, the listing
    match keys and row indices, and the ``lat_postcode``/``lon_postcode``
    duplicates.
    """
    internal_prefixes = ("_property_", "_epc_")
    internal_exact = frozenset(
        (
            "_listing_idx",
            "_listing_match_address",
            "_listing_match_postcode",
            "_original_postcode",
            "_property_row",
            "_epc_row",
            "lat_postcode",
            "lon_postcode",
        )
    )

    def is_internal(name: str) -> bool:
        return name in internal_exact or name.startswith(internal_prefixes)

    return df.drop([name for name in df.columns if is_internal(name)], strict=False)
def build_enriched_actual_listings(
    listings_path: Path,
    properties_path: Path,
    postcode_features_path: Path,
    arcgis_path: Path,
    output_path: Path,
    *,
    epc_path: Path | None = None,
) -> pl.DataFrame:
    """Enrich scraped listings and write them to ``output_path`` as parquet.

    Listings are joined with postcode features, fuzzy-matched to historical
    properties and, when ``epc_path`` is given, to EPC certificates; the
    matched values are then coalesced into the final feature columns.

    The match diagnostics columns ("Historical property match ..." and
    "EPC match ...") are always present in the output, filled with nulls for
    unmatched listings, so the schema does not depend on whether a run
    happened to produce matches or had an EPC source.

    Args:
        listings_path: Scraped listings parquet.
        properties_path: Historical properties parquet.
        postcode_features_path: Postcode feature parquet.
        arcgis_path: Source for the postcode remapping.
        output_path: Destination parquet; parent directories are created.
        epc_path: EPC certificates source, or None to skip EPC matching.

    Returns:
        The enriched listings frame that was written.
    """
    print(f"Loading listings from {listings_path}...")
    listings = _read_listings(listings_path, arcgis_path)
    listing_postcodes = (
        listings.select("_listing_match_postcode")
        .drop_nulls()
        .unique()
        .to_series()
        .to_list()
    )
    print(f"Listings: {listings.height}; unique postcodes: {len(listing_postcodes)}")

    print(f"Loading property candidates from {properties_path}...")
    property_candidates = _load_property_candidates(properties_path, listing_postcodes)
    print(f"Property candidates: {property_candidates.height}")
    property_matches = _match_properties(listings, property_candidates)
    print(f"Historical property matches: {property_matches.height}")
    property_match_frame = _property_match_frame(property_matches, property_candidates)

    enriched = _join_postcode_features(listings, postcode_features_path)
    # Joining even an empty match frame adds every diagnostics column as
    # typed nulls, keeping the output schema stable.
    enriched = enriched.join(property_match_frame, on="_listing_idx", how="left")

    if epc_path is None:
        # With no candidates _match_epc returns its empty, fully-typed frame.
        epc_match_frame = _match_epc(listings, pl.DataFrame())
    else:
        with tempfile.TemporaryDirectory(
            prefix="actual_listing_epc_", dir=local_tmp_dir()
        ) as tmpdir:
            print(f"Loading EPC candidates from {epc_path}...")
            epc_candidates = _load_epc_candidates(
                epc_path, listing_postcodes, Path(tmpdir)
            )
            print(f"EPC candidates: {epc_candidates.height}")
            epc_matches = _match_epc(listings, epc_candidates)
            print(f"EPC matches: {epc_matches.height}")
            epc_match_frame = _epc_match_frame(epc_matches, epc_candidates)
    enriched = enriched.join(epc_match_frame, on="_listing_idx", how="left")

    enriched = _ensure_prefixed_columns(
        enriched, PROPERTY_CANDIDATE_COLUMNS, "_property_"
    )
    enriched = _ensure_prefixed_columns(enriched, EPC_ENRICHMENT_COLUMNS, "_epc_")
    enriched = _coalesce_feature_columns(enriched)
    enriched = _drop_internal_columns(enriched)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    enriched.write_parquet(output_path)
    size_mb = output_path.stat().st_size / (1024 * 1024)
    print(
        f"Wrote {enriched.height} enriched listings to {output_path} ({size_mb:.1f} MB)"
    )
    return enriched
def main() -> None:
    """Parse the command line and build the enriched listings parquet.

    EPC matching is skipped when ``--no-epc`` is given or when the ``--epc``
    path does not exist (a notice is printed in the latter case).
    """
    cli = argparse.ArgumentParser(
        description="Build a pre-enriched actual-listings parquet for the server"
    )

    def add_path_option(flag: str, default: str, help_text: str) -> None:
        cli.add_argument(flag, type=Path, default=Path(default), help=help_text)

    add_path_option(
        "--listings",
        "finder/data/online_listings_buy.parquet",
        "Input scraped listings parquet",
    )
    add_path_option(
        "--properties",
        "property-data/properties.parquet",
        "Historical properties parquet",
    )
    add_path_option(
        "--postcode-features",
        "property-data/postcode.parquet",
        "Postcode feature parquet",
    )
    add_path_option(
        "--arcgis",
        "property-data/arcgis_data.parquet",
        "ArcGIS/NSPL postcode parquet used for terminated-postcode remapping",
    )
    add_path_option(
        "--epc",
        "manual-data/domestic-csv.zip",
        "Optional EPC certificates CSV/zip for direct listing-to-EPC fuzzy matching",
    )
    cli.add_argument(
        "--no-epc",
        action="store_true",
        help="Skip direct EPC matching even when --epc exists",
    )
    add_path_option(
        "--output",
        "finder/data/online_listings_buy_enriched.parquet",
        "Output enriched listings parquet",
    )
    options = cli.parse_args()

    if options.no_epc:
        epc_source = None
    elif options.epc.exists():
        epc_source = options.epc
    else:
        print(
            f"EPC source not found at {options.epc}; continuing without direct EPC matching"
        )
        epc_source = None

    build_enriched_actual_listings(
        listings_path=options.listings,
        properties_path=options.properties,
        postcode_features_path=options.postcode_features,
        arcgis_path=options.arcgis,
        epc_path=epc_source,
        output_path=options.output,
    )


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load diff

View file

@ -53,7 +53,7 @@ Build an STRtree spatial index over the INSPIRE candidate polygons. Convert all
For each INSPIRE parcel that contains at least one UPRN, run a majority vote: whichever postcode has the most UPRNs inside that parcel wins the parcel. Accumulate winning parcels per postcode, union them, and clip to the OA boundary. The result is `claimed[postcode] = polygon_within_oa`.
Then resolve overlaps: INSPIRE parcels can overlap geographically (digitization overlaps), so two postcodes might claim the same square meters. Walk through the claimed dict in insertion order (the postcode with the most parcel wins gets priority by virtue of appearing first), subtracting the running union from each subsequent postcode's geometry.
For INSPIRE parcels with no contained UPRN, assign the clipped parcel to the nearest UPRN's postcode using the parcel's representative point. These nearest-postcode claims run after contained-UPRN claims, so explicit address-in-parcel evidence keeps priority. Then resolve overlaps: INSPIRE parcels can overlap geographically (digitization overlaps), so two postcodes might claim the same square meters. Walk through claims in priority order, subtracting the running union from each subsequent postcode's geometry.
#### Stage B: Voronoi distribution of remaining area
@ -67,7 +67,7 @@ The Voronoi computation (`voronoi.py`):
5. For each real point's Voronoi cell, constructs the polygon from the Voronoi vertices, clips to the boundary, groups by postcode
6. Unions per-postcode fragments
The effect: every unclaimed patch of OA gets assigned to the nearest postcode by straight-line distance (Voronoi tessellation is exactly the set of all points nearest to each generator).
The effect: every non-parcel patch of OA gets assigned to the nearest postcode by straight-line distance (Voronoi tessellation is exactly the set of all points nearest to each generator).
#### Stage C: Combine
@ -77,7 +77,7 @@ The output of `process_oa` is `list[(postcode, polygon)]` — the per-OA fragmen
### Phase 4: Merging and writing
**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 1m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps only the largest polygon — postcodes are contiguous delivery routes, so detached fragments are artifacts.
**Fragment merging** (`output.py:merge_fragments`): Groups all fragments by postcode, unions them. If the result is a MultiPolygon (meaning the postcode has disconnected pieces — either from spanning OAs with a gap, or algorithm artifacts), applies a 5m buffer-then-unbuffer to close tiny gaps from floating-point mismatches at OA boundary edges. If still a MultiPolygon after that, keeps only the largest polygon — postcodes are contiguous delivery routes, so detached fragments are artifacts.
**GeoJSON output** (`output.py:write_district_geojson`): Groups postcodes by district (the outward code, e.g. `SW1A` from `SW1A 1AA`). For each district, converts every postcode polygon from BNG to WGS84 using pyproj, simplifies with 1m tolerance (Douglas-Peucker), rounds coordinates to 6 decimal places (~0.1m precision), and writes a single `{district}.geojson` FeatureCollection. Each Feature has `postcodes` (formatted like `"SW1A 1AA"`) and `mapit_code` (no space: `"SW1A1AA"`) in its properties.

View file

@ -8,9 +8,10 @@ Algorithm per OA:
1. Single-postcode OA → entire OA polygon assigned to that postcode
2. Multi-postcode OA:
a. Assign INSPIRE parcels to postcodes via UPRN point-in-polygon majority vote
b. Union INSPIRE parcels per postcode, clip to OA → "claimed" area
c. Distribute remaining (unclaimed) OA area via Voronoi of UPRN points
d. Final polygon = claimed + Voronoi share
b. Assign INSPIRE parcels with no contained UPRN to the nearest UPRN postcode
c. Union parcel claims per postcode, clip to OA → "claimed" area
d. Distribute remaining non-parcel OA area via Voronoi of UPRN points
e. Final polygon = parcel claims + Voronoi share
Memory-efficient design (<12GB total):
- INSPIRE polygons stored as raw coordinate bytes in parquet; Shapely objects built

View file

@ -1,12 +1,15 @@
from collections import Counter, defaultdict
import numpy as np
from scipy.spatial import cKDTree
from shapely import STRtree, make_valid
from shapely.geometry import MultiPolygon, Polygon
from shapely.ops import unary_union
from .voronoi import compute_voronoi_regions
MIN_GEOM_AREA = 0.01
def process_oa(
oa_geom: Polygon | MultiPolygon,
@ -19,76 +22,31 @@ def process_oa(
if len(unique_pcs) == 1:
return [(next(iter(unique_pcs)), oa_geom)]
# Try INSPIRE-based assignment
claimed: dict[str, Polygon | MultiPolygon] = {}
if len(points) == 0:
return []
valid_oa = _clean_polygonal(oa_geom)
if valid_oa is None:
return []
if inspire_candidates:
cand_tree = STRtree(inspire_candidates)
from shapely import points as shp_points
uprn_pts = shp_points(points)
pt_idx, cand_idx = cand_tree.query(uprn_pts, predicate="intersects")
# Majority vote per candidate polygon
cand_postcodes: dict[int, list[str]] = defaultdict(list)
for pi, ci in zip(pt_idx, cand_idx):
cand_postcodes[ci].append(postcodes[pi])
pc_inspire_polys: dict[str, list[Polygon]] = defaultdict(list)
for ci, pc_list in cand_postcodes.items():
winner = Counter(pc_list).most_common(1)[0][0]
pc_inspire_polys[winner].append(inspire_candidates[ci])
for pc, polys in pc_inspire_polys.items():
merged = unary_union(polys)
if not merged.is_valid:
merged = make_valid(merged)
valid_oa = oa_geom if oa_geom.is_valid else make_valid(oa_geom)
clipped = merged.intersection(valid_oa)
if not clipped.is_empty:
if not clipped.is_valid:
clipped = make_valid(clipped)
clipped = _extract_polygonal(clipped)
if clipped is not None:
claimed[pc] = clipped
# Resolve overlaps: INSPIRE parcels can overlap geographically, so two
# postcodes may claim the same area. Give contested area to whichever
# postcode claimed it first (most UPRNs → first in insertion order).
if len(claimed) > 1:
resolved: dict[str, Polygon | MultiPolygon] = {}
used = None
for pc, geom in claimed.items():
if used is not None:
if not geom.is_valid:
geom = make_valid(geom)
if not used.is_valid:
used = make_valid(used)
geom = geom.difference(used)
if geom.is_empty:
continue
geom = _extract_polygonal(geom)
if geom is None:
continue
resolved[pc] = geom
used = geom if used is None else unary_union([used, geom])
claimed = resolved
claimed = _claim_inspire_parcels(valid_oa, points, postcodes, inspire_candidates)
else:
claimed = {}
# Compute remaining area
if claimed:
all_claimed = unary_union(list(claimed.values()))
if not all_claimed.is_valid:
all_claimed = make_valid(all_claimed)
valid_oa = oa_geom if oa_geom.is_valid else make_valid(oa_geom)
remaining = valid_oa.difference(all_claimed)
if not remaining.is_valid:
remaining = make_valid(remaining)
all_claimed = _clean_polygonal(all_claimed)
remaining = (
valid_oa.difference(all_claimed) if all_claimed is not None else valid_oa
)
remaining = _clean_polygonal(remaining)
else:
remaining = oa_geom if oa_geom.is_valid else make_valid(oa_geom)
remaining = valid_oa
# Distribute remaining area via Voronoi
if not remaining.is_empty and remaining.area > 0.01:
# Distribute non-parcel land via Voronoi
if remaining is not None and not remaining.is_empty and remaining.area > MIN_GEOM_AREA:
voronoi_result = compute_voronoi_regions(points, postcodes, remaining)
else:
voronoi_result = {}
@ -102,17 +60,167 @@ def process_oa(
fragments = []
for pc, parts in result.items():
merged = unary_union(parts)
if not merged.is_empty:
if not merged.is_valid:
merged = make_valid(merged)
merged = _extract_polygonal(merged)
if merged is not None:
fragments.append((pc, merged))
merged = _clean_polygonal(unary_union(parts))
if merged is not None:
fragments.append((pc, merged))
return fragments
def _claim_inspire_parcels(
    valid_oa: Polygon | MultiPolygon,
    points: np.ndarray,
    postcodes: list[str],
    inspire_candidates: list[Polygon],
) -> dict[str, Polygon | MultiPolygon]:
    """Turn the INSPIRE candidates of one OA into per-postcode parcel claims.

    Claims are built in two tiers and then de-overlapped in priority order:

    1. Parcels with at least one UPRN point ``within`` them go to the
       postcode that holds the most of those UPRNs.
    2. Every other parcel, minus the area already taken by tier 1, is split
       into its polygon components; each component goes to the postcode of
       the UPRN nearest to its representative point.

    Args:
        valid_oa: OA geometry the candidates are clipped to.
        points: UPRN coordinates, index-aligned with ``postcodes``.
        postcodes: Postcode of each UPRN.
        inspire_candidates: INSPIRE parcel polygons to consider.

    Returns:
        Mapping of postcode to its resolved parcel claim; empty when no
        candidate survives cleaning and clipping.
    """
    parcels = _prepare_inspire_parcels(valid_oa, inspire_candidates)
    if not parcels:
        return {}

    from shapely import points as shp_points

    # Tier 1: collect, per parcel, the postcodes of the UPRNs lying within it.
    point_ids, parcel_ids = STRtree(parcels).query(
        shp_points(points), predicate="within"
    )
    votes_by_parcel: dict[int, list[str]] = defaultdict(list)
    for point_id, parcel_id in zip(point_ids, parcel_ids):
        votes_by_parcel[parcel_id].append(postcodes[point_id])

    # Majority vote decides the owner of each occupied parcel (blocks of flats,
    # overlapping parcel data); winning vote counts accumulate per postcode.
    tier_one_parts: dict[str, list] = defaultdict(list)
    tier_one_votes: Counter[str] = Counter()
    for parcel_id, ballot in votes_by_parcel.items():
        winner, winning_votes = Counter(ballot).most_common(1)[0]
        tier_one_parts[winner].append(parcels[parcel_id])
        tier_one_votes[winner] += winning_votes

    def _tier_one_rank(claim):
        # Most votes first, then largest area, then postcode for determinism.
        pc, geom = claim
        return (-tier_one_votes[pc], -geom.area, pc)

    tier_one = sorted(
        _merge_parts_by_postcode(tier_one_parts).items(), key=_tier_one_rank
    )

    # Tier 2: parcels without a contained UPRN follow the nearest UPRN, so
    # parcel boundaries carry more of the visible postcode shape and Voronoi
    # is left with roads, parks, water and other non-parcel gaps.
    occupied_area = _union_claims(tier_one)
    uprn_tree = cKDTree(points.astype(np.float64, copy=False))
    orphan_parts: dict[str, list] = defaultdict(list)
    for parcel_id, parcel in enumerate(parcels):
        if parcel_id in votes_by_parcel:
            continue
        leftover = (
            parcel if occupied_area is None else parcel.difference(occupied_area)
        )
        # Components are assigned one by one: subtracting tier-1 area can
        # split a parcel into pieces that sit nearest to different UPRNs.
        for piece in _polygon_parts(leftover):
            piece = _clean_polygonal(piece)
            if piece is None:
                continue
            owner = _nearest_postcode(piece, uprn_tree, postcodes)
            orphan_parts[owner].append(piece)

    tier_two = sorted(
        _merge_parts_by_postcode(orphan_parts).items(),
        key=lambda claim: (-claim[1].area, claim[0]),
    )

    return _resolve_ordered_claims(tier_one + tier_two)
def _prepare_inspire_parcels(
    valid_oa: Polygon | MultiPolygon,
    inspire_candidates: list[Polygon],
) -> list[Polygon | MultiPolygon]:
    """Clean each INSPIRE candidate and clip it to the OA.

    Candidates that clean to nothing, do not intersect ``valid_oa``, or whose
    clipped remainder cleans to nothing are left out of the result.
    """
    clipped_parcels: list[Polygon | MultiPolygon] = []
    for raw_parcel in inspire_candidates:
        parcel = _clean_polygonal(raw_parcel)
        if parcel is None or not parcel.intersects(valid_oa):
            continue
        inside_oa = _clean_polygonal(parcel.intersection(valid_oa))
        if inside_oa is None:
            continue
        clipped_parcels.append(inside_oa)
    return clipped_parcels
def _nearest_postcode(
geom: Polygon | MultiPolygon,
tree: cKDTree,
postcodes: list[str],
) -> str:
point = geom.representative_point()
_, idx = tree.query([point.x, point.y])
return postcodes[idx]
def _polygon_parts(geom) -> list[Polygon]:
    """Return the polygon components of ``geom`` after cleaning.

    A geometry that cleans to nothing gives an empty list, a single Polygon
    gives a one-element list, and anything else is expanded via ``.geoms``.
    """
    cleaned = _clean_polygonal(geom)
    if cleaned is None:
        return []
    return [cleaned] if cleaned.geom_type == "Polygon" else list(cleaned.geoms)
def _merge_parts_by_postcode(
    parts_by_postcode: dict[str, list],
) -> dict[str, Polygon | MultiPolygon]:
    """Union each postcode's parts into one cleaned geometry.

    Postcodes whose union cleans to nothing are omitted; the remaining keys
    keep the iteration order of ``parts_by_postcode``.
    """
    unions = (
        (pc, _clean_polygonal(unary_union(parts)))
        for pc, parts in parts_by_postcode.items()
    )
    return {pc: geom for pc, geom in unions if geom is not None}
def _union_claims(
    claims: list[tuple[str, Polygon | MultiPolygon]],
) -> Polygon | MultiPolygon | None:
    """Return the cleaned union of all claim geometries, or None if there are none."""
    if claims:
        return _clean_polygonal(unary_union([geom for _, geom in claims]))
    return None
def _resolve_ordered_claims(
    claims: list[tuple[str, Polygon | MultiPolygon]],
) -> dict[str, Polygon | MultiPolygon]:
    """De-overlap parcel claims, giving contested area to the earlier claim.

    Claims are visited in list order. Each one keeps only what no earlier
    claim has taken; whatever it keeps is added to the taken area before the
    next claim is considered.
    """
    kept: dict[str, list] = defaultdict(list)
    taken = None
    for pc, claim in claims:
        remainder = _clean_polygonal(claim)
        if remainder is not None and taken is not None:
            remainder = _clean_polygonal(remainder.difference(taken))
        if remainder is None:
            continue
        kept[pc].append(remainder)
        grown = remainder if taken is None else unary_union([taken, remainder])
        taken = _clean_polygonal(grown)
    return _merge_parts_by_postcode(kept)
def _clean_polygonal(geom) -> Polygon | MultiPolygon | None:
    """Normalise ``geom`` to a valid polygonal geometry, or None.

    None and empty inputs give None. Invalid geometries go through
    ``make_valid`` before the polygonal parts are extracted. A result that is
    missing, empty, or no larger than ``MIN_GEOM_AREA`` also gives None.
    """
    if geom is None or geom.is_empty:
        return None
    repaired = geom if geom.is_valid else make_valid(geom)
    polygonal = _extract_polygonal(repaired)
    if polygonal is None or polygonal.is_empty:
        return None
    return None if polygonal.area <= MIN_GEOM_AREA else polygonal
def _extract_polygonal(geom) -> Polygon | MultiPolygon | None:
"""Extract only Polygon/MultiPolygon parts from a geometry.

View file

@ -7,6 +7,7 @@ import numpy as np
import polars as pl
import pytest
from shapely.geometry import MultiPolygon, Polygon, box
from shapely.ops import unary_union
from .oa_boundaries import parse_gpkg_geometry
from .greenspace import subtract_greenspace
@ -215,6 +216,20 @@ class TestVoronoiCollinear:
assert ratio > 0.3, f"Area split too unfair: {area_a:.0f} vs {area_b:.0f}"
class TestVoronoiCoverage:
    """Voronoi regions must span the whole boundary even for clustered UPRNs."""

    def test_clustered_points_cover_large_boundary(self):
        # Two points bunched at one end of a long, thin boundary.
        long_strip = box(0, 0, 5000, 100)
        clustered = np.array([[10, 50], [20, 50]])

        regions = compute_voronoi_regions(clustered, ["A", "B"], long_strip)
        union_of_regions = unary_union(list(regions.values()))

        assert union_of_regions.area == pytest.approx(long_strip.area)
        assert long_strip.difference(union_of_regions).area < 0.01
class TestEqualSplitFallback:
"""_equal_split_fallback must give every postcode some area."""
@ -306,6 +321,186 @@ class TestProcessOAGeometryTypes:
)
class TestProcessOAInspireParcelAssignment:
    """Parcels lacking a UPRN must still influence the postcode outlines."""

    def test_unoccupied_inspire_parcel_goes_to_nearest_postcode(self):
        """An empty parcel is handed over whole instead of being cut by Voronoi."""
        oa = box(0, 0, 100, 100)
        empty_parcel = box(20, 40, 65, 60)  # straddles the x=50 Voronoi split
        uprns = np.array([[10, 50], [90, 50]])  # postcode A, postcode B

        by_postcode = dict(
            process_oa(oa, uprns, ["A", "B"], inspire_candidates=[empty_parcel])
        )

        assert "A" in by_postcode and "B" in by_postcode
        assert empty_parcel.difference(by_postcode["A"]).area < 0.01
        assert by_postcode["B"].intersection(empty_parcel).area < 0.01

    def test_contained_uprn_claim_wins_over_overlapping_nearest_parcel(self):
        """A parcel holding a UPRN keeps the overlap with a nearest-assigned one."""
        oa = box(0, 0, 100, 100)
        occupied_by_a = box(0, 0, 60, 100)
        empty_near_b = box(50, 0, 80, 100)
        uprns = np.array(
            [
                [20, 50],  # postcode A, inside occupied_by_a
                [90, 50],  # postcode B, outside empty_near_b
            ]
        )

        by_postcode = dict(
            process_oa(
                oa,
                uprns,
                ["A", "B"],
                inspire_candidates=[occupied_by_a, empty_near_b],
            )
        )

        assert "A" in by_postcode and "B" in by_postcode
        assert occupied_by_a.difference(by_postcode["A"]).area < 0.01
        assert by_postcode["A"].intersection(by_postcode["B"]).area < 0.01
        assert by_postcode["B"].intersection(box(60, 0, 80, 100)).area > 0

    def test_nearest_uses_assignable_fragment_after_contained_subtraction(self):
        """Nearest lookup is based on what remains after the priority claim."""
        oa = box(0, 0, 100, 100)
        occupied_by_a = box(0, 0, 60, 100)
        empty_parcel = box(25, 0, 80, 100)
        uprns = np.array(
            [
                [20, 50],  # postcode A, inside occupied_by_a
                [90, 50],  # postcode B, nearest to the leftover of empty_parcel
            ]
        )

        by_postcode = dict(
            process_oa(
                oa,
                uprns,
                ["A", "B"],
                inspire_candidates=[occupied_by_a, empty_parcel],
            )
        )

        assert occupied_by_a.difference(by_postcode["A"]).area < 0.01
        assert box(60, 0, 80, 100).difference(by_postcode["B"]).area < 0.01

    def test_boundary_uprn_does_not_claim_adjacent_parcel(self):
        """A UPRN sitting on a shared edge must not vote in both parcels."""
        oa = box(0, 0, 100, 100)
        west = box(0, 0, 50, 100)
        east = box(50, 0, 100, 100)
        uprns = np.array(
            [
                [50, 50],  # postcode A, exactly on the shared parcel edge
                [75, 50],  # postcode B, strictly inside the east parcel
            ]
        )

        by_postcode = dict(
            process_oa(oa, uprns, ["A", "B"], inspire_candidates=[west, east])
        )

        assert "A" in by_postcode and "B" in by_postcode
        assert east.difference(by_postcode["B"]).area < 0.01

    def test_disconnected_nearest_fragments_can_go_to_different_postcodes(self):
        """Each piece of a split empty parcel is assigned on its own."""
        oa = box(0, 0, 100, 100)
        occupied_by_b = box(40, 0, 60, 100)
        empty_band = box(0, 40, 100, 60)
        uprns = np.array(
            [
                [10, 20],  # postcode A, nearest to the left piece
                [50, 20],  # postcode B, inside occupied_by_b, outside empty_band
                [90, 20],  # postcode C, nearest to the right piece
            ]
        )

        by_postcode = dict(
            process_oa(
                oa,
                uprns,
                ["A", "B", "C"],
                inspire_candidates=[occupied_by_b, empty_band],
            )
        )

        assert box(0, 40, 40, 60).difference(by_postcode["A"]).area < 0.01
        assert box(60, 40, 100, 60).difference(by_postcode["C"]).area < 0.01

    def test_overlapping_nearest_parcels_do_not_overlap_in_output(self):
        """Two overlapping empty parcels end up as disjoint postcode areas."""
        oa = box(0, 0, 100, 100)
        west = box(0, 0, 70, 100)
        east = box(30, 0, 100, 100)
        uprns = np.array(
            [
                [10, 50],  # postcode A, nearest to the west parcel
                [90, 50],  # postcode B, nearest to the east parcel
            ]
        )

        by_postcode = dict(
            process_oa(oa, uprns, ["A", "B"], inspire_candidates=[west, east])
        )

        assert "A" in by_postcode and "B" in by_postcode
        assert by_postcode["A"].intersection(by_postcode["B"]).area < 0.01

    def test_mixed_inspire_and_voronoi_covers_oa_without_overlap(self):
        """Parcel claims and the Voronoi fill together tile the OA exactly."""
        oa = box(0, 0, 100, 100)
        occupied_by_a = box(0, 0, 30, 100)
        empty_parcel = box(70, 0, 90, 100)
        uprns = np.array([[10, 50], [90, 50]])

        fragments = process_oa(
            oa,
            uprns,
            ["A", "B"],
            inspire_candidates=[occupied_by_a, empty_parcel],
        )
        shapes = [shape for _, shape in fragments]
        union_of_shapes = unary_union(shapes)
        # Summed area exceeds the union's area only where shapes overlap.
        double_counted = sum(shape.area for shape in shapes) - union_of_shapes.area

        assert union_of_shapes.area == pytest.approx(oa.area)
        assert oa.difference(union_of_shapes).area < 0.01
        assert double_counted < 0.01

    def test_inspire_parcel_straddling_oa_is_clipped(self):
        """No output fragment may extend beyond the OA outline."""
        oa = box(0, 0, 100, 100)
        crosses_edge = box(80, 0, 140, 100)
        uprns = np.array([[10, 50], [90, 50]])

        fragments = process_oa(
            oa, uprns, ["A", "B"], inspire_candidates=[crosses_edge]
        )

        for _, shape in fragments:
            assert shape.difference(oa).area < 0.01
# ---------------------------------------------------------------------------
# _extract_polygonal helper
# ---------------------------------------------------------------------------

View file

@ -52,9 +52,17 @@ def compute_voronoi_regions(
if len(unique_pts) == 1:
return {unique_pcs[0]: boundary}
if not boundary.is_valid:
boundary = make_valid(boundary)
pts = np.array(unique_pts)
min_e, min_n = pts.min(axis=0)
max_e, max_n = pts.max(axis=0)
pts_min_e, pts_min_n = pts.min(axis=0)
pts_max_e, pts_max_n = pts.max(axis=0)
boundary_min_e, boundary_min_n, boundary_max_e, boundary_max_n = boundary.bounds
min_e = min(pts_min_e, boundary_min_e)
min_n = min(pts_min_n, boundary_min_n)
max_e = max(pts_max_e, boundary_max_e)
max_n = max(pts_max_n, boundary_max_n)
span = max(max_e - min_e, max_n - min_n, 100)
dummy = np.array(
@ -79,9 +87,6 @@ def compute_voronoi_regions(
n_real = len(pts)
pc_polys: dict[str, list[Polygon]] = defaultdict(list)
if not boundary.is_valid:
boundary = make_valid(boundary)
for i in range(n_real):
region_idx = vor.point_region[i]
region = vor.regions[region_idx]

View file

@ -1,143 +0,0 @@
from pathlib import Path
import polars as pl
from pipeline.transform.enrich_actual_listings import build_enriched_actual_listings
def test_build_enriched_actual_listings_joins_postcode_and_property_features(
    tmp_path: Path,
) -> None:
    """One listing is enriched from a property row and postcode features.

    The listing's postcode ("AA1 1AB") differs from the property's
    ("AA1 1AA"); the assertions pin the enriched row to the property's
    postcode and to the property/postcode feature values.
    """

    def _frame(columns):
        # columns: (name, dtype, values) triples; the order defines the schema.
        return pl.DataFrame(
            {name: values for name, _, values in columns},
            schema={name: dtype for name, dtype, _ in columns},
        )

    listings_path = tmp_path / "listings.parquet"
    properties_path = tmp_path / "properties.parquet"
    postcode_path = tmp_path / "postcode.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    output_path = tmp_path / "online_listings_buy_enriched.parquet"

    _frame(
        [
            ("Bedrooms", pl.Int32, [2]),
            ("Bathrooms", pl.Int32, [1]),
            ("Number of bedrooms & living rooms", pl.Int32, [3]),
            ("lon", pl.Float64, [-0.1]),
            ("lat", pl.Float64, [51.5]),
            ("Postcode", pl.Utf8, ["AA1 1AB"]),
            ("Address per Property Register", pl.Utf8, ["1 High Street"]),
            ("Leasehold/Freehold", pl.Utf8, [None]),
            ("Property type", pl.Utf8, ["Terraced"]),
            ("Property sub-type", pl.Utf8, ["Terraced"]),
            ("Price qualifier", pl.Utf8, [""]),
            ("Total floor area (sqm)", pl.Float64, [None]),
            ("Listing URL", pl.Utf8, ["https://example.test/listing"]),
            ("Listing features", pl.List(pl.Utf8), [["Garden"]]),
            ("Listing date", pl.Datetime("us"), [None]),
            ("Listing status", pl.Utf8, ["For sale"]),
            ("Asking price", pl.Int64, [300_000]),
            ("Asking price per sqm", pl.Int32, [None]),
        ]
    ).write_parquet(listings_path)

    _frame(
        [
            ("Address per Property Register", pl.Utf8, ["1 HIGH STREET"]),
            ("Postcode", pl.Utf8, ["AA1 1AA"]),
            ("Leasehold/Freehold", pl.Utf8, ["Freehold"]),
            ("Address per EPC", pl.Utf8, ["1 High Street"]),
            ("Current energy rating", pl.Utf8, ["C"]),
            ("Potential energy rating", pl.Utf8, ["B"]),
            ("Total floor area (sqm)", pl.Float64, [80.0]),
            ("Number of bedrooms & living rooms", pl.Int32, [4]),
            ("Interior height (m)", pl.Float64, [2.4]),
            ("Construction year", pl.UInt16, [1935]),
            ("Former council house", pl.Utf8, ["No"]),
            ("Listed building", pl.Utf8, ["No"]),
            ("Estimated monthly rent", pl.Float32, [1200.0]),
            ("Street tree density percentile", pl.Float32, [75.0]),
            ("Property type", pl.Utf8, ["Terraced"]),
            ("Estimated current price", pl.Float64, [310_000.0]),
        ]
    ).write_parquet(properties_path)

    pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Income Score": [82.5],
            "Within conservation area": ["Yes"],
        }
    ).write_parquet(postcode_path)

    _frame(
        [
            ("pcds", pl.Utf8, ["AA1 1AA", "AA1 1AB"]),
            ("ctry25cd", pl.Utf8, ["E92000001", "E92000001"]),
            ("doterm", pl.Utf8, [None, "202401"]),
            ("east1m", pl.Float64, [100.0, 105.0]),
            ("north1m", pl.Float64, [100.0, 105.0]),
        ]
    ).write_parquet(arcgis_path)

    enriched = build_enriched_actual_listings(
        listings_path=listings_path,
        properties_path=properties_path,
        postcode_features_path=postcode_path,
        arcgis_path=arcgis_path,
        output_path=output_path,
        epc_path=None,
    )
    first = enriched.row(0, named=True)

    assert output_path.exists()
    assert first["Postcode"] == "AA1 1AA"
    assert first["Historical property match status"] == "matched"
    assert first["Income Score"] == 82.5
    assert first["Within conservation area"] == "Yes"
    assert first["Leasehold/Freehold"] == "Freehold"
    assert first["Total floor area (sqm)"] == 80.0
    assert first["Asking price per sqm"] == 3750
    assert first["Estimated current price"] == 300_000
    assert first["Current energy rating"] == "C"

View file

@ -2,16 +2,23 @@ import polars as pl
import pyarrow as pa
import pytest
from shapely import box, to_wkb
from shapely.geometry import Point
from pipeline.transform.merge import (
_AREA_COLUMNS,
CONSERVATION_AREA_FEATURE,
LISTED_BUILDING_FEATURE,
TREE_DENSITY_FEATURE,
_is_unpublished_conservation_area_record,
_LISTING_OVERLAY_SOURCES,
_build_unmatched_listing_seed_rows,
_canonical_postcode_expr,
_finalize_listings,
_integrate_listings,
_match_direct_epc,
_is_dynamic_poi_metric_column,
_less_deprived_percentile_expr,
_load_conservation_area_geometries,
_load_listings_for_merge,
_matched_listed_building_flags,
_postcode_conservation_area_flags,
_postcode_listed_building_candidates,
@ -85,31 +92,28 @@ def test_postcode_conservation_area_flags_marks_point_membership() -> None:
]
def test_unpublished_conservation_area_records_are_identified() -> None:
assert _is_unpublished_conservation_area_record(
"No data available for publication by HE"
)
assert not _is_unpublished_conservation_area_record("Bloomsbury")
assert not _is_unpublished_conservation_area_record(None)
def test_load_conservation_area_geometries_skips_unpublished_placeholders(
def test_load_conservation_area_geometries_uses_current_planning_data_records(
monkeypatch: pytest.MonkeyPatch,
tmp_path,
) -> None:
real_area = box(0, 0, 1, 1)
placeholder_area = box(-100, -100, 100, 100)
ended_area = box(2, 2, 3, 3)
other_dataset_area = box(4, 4, 5, 5)
point = Point(0.5, 0.5)
def fake_read_arrow(path, columns):
assert path == tmp_path / "conservation_areas.gpkg"
assert columns == ["NAME"]
def fake_read_arrow(path):
assert path == tmp_path / "conservation_areas.geojson"
table = pa.table(
{
"NAME": [
"Central Village",
"No data available for publication by HE",
"dataset": [
"conservation-area",
"conservation-area",
"listed-building",
"conservation-area",
],
"SHAPE": to_wkb([real_area, placeholder_area]),
"end-date": ["", "2025-01-01", "", ""],
"name": ["Central Village", "Old Boundary", "Other", "Point Record"],
"SHAPE": to_wkb([real_area, ended_area, other_dataset_area, point]),
}
)
return {"geometry_name": "SHAPE", "crs": "EPSG:4326"}, table
@ -117,7 +121,7 @@ def test_load_conservation_area_geometries_skips_unpublished_placeholders(
monkeypatch.setattr("pipeline.transform.merge.pyogrio.read_arrow", fake_read_arrow)
geometries, crs = _load_conservation_area_geometries(
tmp_path / "conservation_areas.gpkg"
tmp_path / "conservation_areas.geojson"
)
assert crs == "EPSG:4326"
@ -290,3 +294,440 @@ def test_tree_density_by_postcode_requires_postcode_and_density_columns(
with pytest.raises(ValueError, match="missing required column: postcode"):
_tree_density_by_postcode(missing_postcode_path)
def _sample_listings_frame() -> pl.DataFrame:
    """Return a one-row listings frame used as the fixture for merge tests."""
    # (column name, dtype, value of the single row); order defines the schema.
    columns = [
        ("Bedrooms", pl.Int32, 3),
        ("Bathrooms", pl.Int32, 2),
        ("Number of bedrooms & living rooms", pl.Int32, 4),
        ("lon", pl.Float64, -0.1),
        ("lat", pl.Float64, 51.5),
        ("Postcode", pl.Utf8, "sw1a1aa"),
        ("Address per Property Register", pl.Utf8, "1 Example Road"),
        ("Leasehold/Freehold", pl.Utf8, "Freehold"),
        ("Property type", pl.Utf8, "Terraced"),
        ("Property sub-type", pl.Utf8, "Mid-Terrace"),
        ("Price qualifier", pl.Utf8, ""),
        ("Total floor area (sqm)", pl.Float64, 120.0),
        ("Listing URL", pl.Utf8, "https://example.test/abc"),
        ("Listing features", pl.List(pl.Utf8), ["Garden", "Off-street parking"]),
        ("Listing date", pl.Datetime("us"), None),
        ("Listing status", pl.Utf8, "For sale"),
        ("Asking price", pl.Int64, 750_000),
        ("Asking price per sqm", pl.Int32, 6_250),
    ]
    return pl.DataFrame(
        {name: [value] for name, _, value in columns},
        schema={name: dtype for name, dtype, _ in columns},
    )
def _stub_arcgis(path) -> None:
    """Write a one-row postcode lookup parquet for "SW1A 1AA" to ``path``."""
    # (column name, dtype, value of the single row); order defines the schema.
    columns = [
        ("pcds", pl.Utf8, "SW1A 1AA"),
        ("ctry25cd", pl.Utf8, "E92000001"),
        ("doterm", pl.Utf8, None),
        ("east1m", pl.Float64, 530000.0),
        ("north1m", pl.Float64, 180000.0),
    ]
    lookup = pl.DataFrame(
        {name: [value] for name, _, value in columns},
        schema={name: dtype for name, dtype, _ in columns},
    )
    lookup.write_parquet(path)
def test_canonical_postcode_expr_formats_compact_postcodes() -> None:
    """Compact and spaced postcodes canonicalise; bad and null ones give null."""
    raw = pl.DataFrame({"Postcode": ["sw1a1aa", "SW1A 1AA", "bad", None]})
    canonical_expr = _canonical_postcode_expr("Postcode").alias("canonical")

    canonical = raw.with_columns(canonical_expr)["canonical"].to_list()

    assert canonical == ["SW1A 1AA", "SW1A 1AA", None, None]
def test_load_listings_for_merge_canonicalises_and_exposes_overlay_columns(
    tmp_path,
) -> None:
    """Loaded listings carry a canonical postcode plus ``_actual_*`` columns."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    loaded = _load_listings_for_merge(listings_path, arcgis_path)

    expected_by_column = {
        "postcode": ["SW1A 1AA"],
        "pp_address": ["1 Example Road"],
        "_actual_listing_url": ["https://example.test/abc"],
        "_actual_asking_price": [750_000],
        "_actual_lat": [51.5],
    }
    for column, expected in expected_by_column.items():
        assert loaded[column].to_list() == expected
def test_build_unmatched_listing_seed_rows_fills_property_shape_fields(
    tmp_path,
) -> None:
    """Seed rows copy listing values into the property-shaped columns."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)
    listings = _load_listings_for_merge(listings_path, arcgis_path)

    property_fields = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "pp_property_type": pl.Utf8,
        "duration": pl.Utf8,
        "total_floor_area": pl.Float64,
        "number_habitable_rooms": pl.Int16,
        "latest_price": pl.Int64,
        "epc_address": pl.Utf8,
    }
    overlay_fields = {dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES}
    template_schema = pl.Schema(property_fields | overlay_fields)

    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )

    assert seed.height == 1
    expected_by_column = {
        "postcode": ["SW1A 1AA"],
        "pp_address": ["1 Example Road"],
        "pp_property_type": ["Terraced"],
        "duration": ["Freehold"],
        "total_floor_area": [120.0],
        "number_habitable_rooms": [4],
        "latest_price": [750_000],
        # Columns not populated from the listing default to null.
        "epc_address": [None],
        # Overlay columns flow through 1:1.
        "_actual_listing_url": ["https://example.test/abc"],
    }
    for column, expected in expected_by_column.items():
        assert seed[column].to_list() == expected
def test_build_unmatched_listing_seed_rows_uses_direct_epc_fallbacks(
    tmp_path,
) -> None:
    """``_direct_*`` EPC columns fill the fields the listing left null."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"

    # Blank out floor area and room count so only the EPC values can supply them.
    listing_without_size = _sample_listings_frame().with_columns(
        pl.lit(None, dtype=pl.Float64).alias("Total floor area (sqm)"),
        pl.lit(None, dtype=pl.Int32).alias("Number of bedrooms & living rooms"),
    )
    listing_without_size.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    direct_epc_columns = [
        pl.lit("1 Example Road").alias("_direct_epc_address"),
        pl.lit("C").alias("_direct_current_energy_rating"),
        pl.lit("B").alias("_direct_potential_energy_rating"),
        pl.lit(98.0).alias("_direct_total_floor_area"),
        pl.lit(4, dtype=pl.Int16).alias("_direct_number_habitable_rooms"),
        pl.lit(2.4).alias("_direct_floor_height"),
        pl.lit(1930, dtype=pl.UInt16).alias("_direct_construction_age_band"),
        pl.lit(1, dtype=pl.UInt8).alias("_direct_is_construction_date_approximate"),
        pl.lit("No").alias("_direct_was_council_house"),
    ]
    listings = _load_listings_for_merge(listings_path, arcgis_path).with_columns(
        *direct_epc_columns
    )

    property_fields = {
        "postcode": pl.Utf8,
        "pp_address": pl.Utf8,
        "total_floor_area": pl.Float64,
        "number_habitable_rooms": pl.Int16,
        "epc_address": pl.Utf8,
        "current_energy_rating": pl.Utf8,
        "was_council_house": pl.Utf8,
    }
    overlay_fields = {dst: dtype for _src, dst, dtype in _LISTING_OVERLAY_SOURCES}
    template_schema = pl.Schema(property_fields | overlay_fields)

    seed = _build_unmatched_listing_seed_rows(
        listings.select("_listing_idx"), listings, template_schema
    )

    expected_by_column = {
        "total_floor_area": [98.0],
        "number_habitable_rooms": [4],
        "epc_address": ["1 Example Road"],
        "current_energy_rating": ["C"],
        "was_council_house": ["No"],
    }
    for column, expected in expected_by_column.items():
        assert seed[column].to_list() == expected
def test_match_direct_epc_considers_nearby_postcodes() -> None:
    """An EPC row under a different postcode key can still match the listing.

    The listing and the EPC candidate share the match address but carry
    different postcode keys and slightly different coordinates.
    """

    def _frame(columns):
        # columns: (name, dtype, value) triples for a single-row frame.
        return pl.DataFrame(
            {name: [value] for name, _, value in columns},
            schema={name: dtype for name, dtype, _ in columns},
        )

    listing_matches = _frame(
        [
            ("_listing_idx", pl.UInt32, 0),
            ("_listing_match_address", pl.Utf8, "1 EXAMPLE ROAD"),
            ("_listing_match_postcode", pl.Utf8, "AA11AA"),
            ("_listing_east", pl.Float64, 1000.0),
            ("_listing_north", pl.Float64, 1000.0),
            ("_actual_property_type", pl.Utf8, "Terraced"),
            ("_actual_total_floor_area", pl.Float64, 100.0),
            ("_actual_number_habitable_rooms", pl.Int16, 4),
        ]
    )
    epc_candidates = _frame(
        [
            ("_direct_epc_row", pl.UInt32, 0),
            ("_direct_epc_match_address", pl.Utf8, "1 EXAMPLE ROAD"),
            ("_direct_epc_match_postcode", pl.Utf8, "BB11BB"),
            ("_direct_epc_east", pl.Float64, 1020.0),
            ("_direct_epc_north", pl.Float64, 1010.0),
            ("_direct_epc_canonical_property_type", pl.Utf8, "Terraced"),
            ("_direct_epc_address", pl.Utf8, "1, Example Road"),
            ("_direct_current_energy_rating", pl.Utf8, "C"),
            ("_direct_potential_energy_rating", pl.Utf8, "B"),
            ("_direct_total_floor_area", pl.Float64, 101.0),
            ("_direct_number_habitable_rooms", pl.Int16, 4),
            ("_direct_floor_height", pl.Float64, 2.5),
            ("_direct_construction_age_band", pl.UInt16, 1930),
            ("_direct_is_construction_date_approximate", pl.UInt8, 1),
            ("_direct_was_council_house", pl.Utf8, "No"),
        ]
    )

    matches = _match_direct_epc(listing_matches, epc_candidates)

    assert matches.height == 1
    assert matches["_listing_idx"].to_list() == [0]
    assert matches["_direct_epc_address"].to_list() == ["1, Example Road"]
def test_integrate_listings_attaches_overlay_by_matched_property_key(tmp_path) -> None:
    """Only the property row matching the listing receives the overlay URL."""
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    _sample_listings_frame().write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # (column name, dtype, values); first row is unrelated, second is the match.
    wide_columns = [
        ("postcode", pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        ("pp_address", pl.Utf8, ["9 Other Road", "1 Example Road"]),
        ("pp_property_type", pl.Utf8, ["Detached", "Terraced"]),
        ("duration", pl.Utf8, ["Freehold", "Freehold"]),
        ("total_floor_area", pl.Float64, [80.0, 90.0]),
        ("number_habitable_rooms", pl.Int16, [3, 4]),
        ("latest_price", pl.Int64, [500_000, 600_000]),
        ("epc_address", pl.Utf8, [None, "1 Example Road"]),
        ("current_energy_rating", pl.Utf8, [None, "C"]),
        ("potential_energy_rating", pl.Utf8, [None, "B"]),
        ("floor_height", pl.Float64, [None, 2.4]),
        ("construction_age_band", pl.UInt16, [None, 1930]),
        ("is_construction_date_approximate", pl.UInt8, [None, 1]),
        ("was_council_house", pl.Utf8, [None, "No"]),
    ]
    wide = pl.DataFrame(
        {name: values for name, _, values in wide_columns},
        schema={name: dtype for name, dtype, _ in wide_columns},
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    def _listing_urls(address):
        rows = integrated.filter(pl.col("pp_address") == address)
        return rows["_actual_listing_url"].to_list()

    assert _listing_urls("1 Example Road") == ["https://example.test/abc"]
    assert _listing_urls("9 Other Road") == [None]
def test_integrate_listings_rejects_low_confidence_no_number_match(tmp_path) -> None:
    """A number-less address that only loosely matches stays unmatched.

    The existing property row keeps a null listing URL and the listing shows
    up under its own address instead.
    """
    listings_path = tmp_path / "listings.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    renamed_listing = _sample_listings_frame().with_columns(
        pl.lit("Rose Cottage High Street").alias("Address per Property Register"),
    )
    renamed_listing.write_parquet(listings_path)
    _stub_arcgis(arcgis_path)

    # (column name, dtype, value) for the single existing property row.
    wide_columns = [
        ("postcode", pl.Utf8, "SW1A 1AA"),
        ("pp_address", pl.Utf8, "Old Cottage High Street"),
        ("pp_property_type", pl.Utf8, "Terraced"),
        ("duration", pl.Utf8, "Freehold"),
        ("total_floor_area", pl.Float64, 120.0),
        ("number_habitable_rooms", pl.Int16, 4),
        ("latest_price", pl.Int64, 750_000),
        ("epc_address", pl.Utf8, "Old Cottage High Street"),
        ("current_energy_rating", pl.Utf8, "C"),
        ("potential_energy_rating", pl.Utf8, "B"),
        ("floor_height", pl.Float64, 2.4),
        ("construction_age_band", pl.UInt16, 1930),
        ("is_construction_date_approximate", pl.UInt8, 1),
        ("was_council_house", pl.Utf8, "No"),
    ]
    wide = pl.DataFrame(
        {name: [value] for name, _, value in wide_columns},
        schema={name: dtype for name, dtype, _ in wide_columns},
    )

    integrated = _integrate_listings(
        wide.lazy(), listings_path, arcgis_path, epc_path=None
    ).collect()

    def _listing_urls(address):
        rows = integrated.filter(pl.col("pp_address") == address)
        return rows["_actual_listing_url"].to_list()

    assert _listing_urls("Old Cottage High Street") == [None]
    assert _listing_urls("Rose Cottage High Street") == ["https://example.test/abc"]
def test_finalize_listings_promotes_overlay_columns_and_filters_to_listing_rows() -> (
    None
):
    """Check that ``_finalize_listings`` promotes ``_actual_*`` overlay values.

    Two rows are fed in, both carrying a listing URL.  Row 0 also has
    historical values (EPC address, transaction date, last known price);
    row 1 has none.  The assertions pin the promoted listing columns, the
    derived price columns, the match status, and the removal of the overlay
    columns.
    """
    # Input frame described as column -> (dtype, [row 0, row 1]).
    spec = {
        "Postcode": (pl.Utf8, ["SW1A 1AA", "SW1A 1AA"]),
        "Address per Property Register": (
            pl.Utf8,
            ["1 Example Road", "2 Example Road"],
        ),
        "Address per EPC": (pl.Utf8, ["1 Example Road", None]),
        "Date of last transaction": (pl.Float64, [1990.0, None]),
        "lat": (pl.Float64, [51.5, 51.5]),
        "lon": (pl.Float64, [-0.1, -0.1]),
        "Total floor area (sqm)": (pl.Float64, [100.0, 95.0]),
        "Number of bedrooms & living rooms": (pl.Int16, [3, None]),
        "Property type": (pl.Utf8, ["Terraced", None]),
        "Leasehold/Freehold": (pl.Utf8, ["Leasehold", None]),
        "Last known price": (pl.Int64, [500_000, None]),
        "Street tree density percentile": (pl.Float32, [42.0, 42.0]),
        # Overlay columns: row 0 is expected to come out as "matched" and
        # row 1 as "unmatched" (see the match-status assertion below).
        "_actual_listing_url": (pl.Utf8, ["url0", "url1"]),
        "_actual_asking_price": (pl.Int64, [600_000, 700_000]),
        "_actual_asking_price_per_sqm": (pl.Int32, [5_000, None]),
        "_actual_listing_date": (pl.Datetime("us"), [None, None]),
        "_actual_listing_status": (pl.Utf8, ["For sale", "For sale"]),
        "_actual_listing_features": (pl.List(pl.Utf8), [["Garden"], ["Parking"]]),
        "_actual_bedrooms": (pl.Int32, [3, 4]),
        "_actual_bathrooms": (pl.Int32, [1, 2]),
        "_actual_price_qualifier": (pl.Utf8, ["", ""]),
        "_actual_property_sub_type": (pl.Utf8, ["Mid-Terrace", "End-Terrace"]),
        "_actual_lat": (pl.Float64, [51.51, 51.52]),
        "_actual_lon": (pl.Float64, [-0.11, -0.12]),
        "_actual_total_floor_area": (pl.Float64, [110.0, None]),
        "_actual_number_habitable_rooms": (pl.Int16, [4, 3]),
        "_actual_property_type": (pl.Utf8, ["Terraced", "Flats/Maisonettes"]),
        "_actual_leasehold_freehold": (pl.Utf8, ["Freehold", "Leasehold"]),
    }
    frame = pl.DataFrame(
        {name: values for name, (_dtype, values) in spec.items()},
        schema={name: dtype for name, (dtype, _values) in spec.items()},
    )

    finalized = _finalize_listings(frame).sort("Address per Property Register")

    def values_of(column: str) -> list:
        """Return the finalized column as a plain Python list."""
        return finalized[column].to_list()

    assert finalized.height == 2

    # Listing and price columns.
    assert values_of("Listing URL") == ["url0", "url1"]
    assert values_of("Asking price") == [600_000, 700_000]
    assert values_of("Asking price per sqm") == [5_000, 7_368]
    assert values_of("Est. price per sqm") == [5_000, 7_368]
    assert values_of("Estimated current price") == [600_000, 700_000]
    assert values_of("Last known price") == [500_000, 700_000]

    # The listing's floor area / rooms / property type / tenure win where present.
    assert values_of("Total floor area (sqm)") == [110.0, 95.0]
    assert values_of("Number of bedrooms & living rooms") == [4, 3]
    assert values_of("Property type") == ["Terraced", "Flats/Maisonettes"]
    assert values_of("Leasehold/Freehold") == ["Freehold", "Leasehold"]

    # Postcode-level feature is carried through on both rows.
    assert values_of("Street tree density percentile") == [42.0, 42.0]

    # Match status reflects whether historical context was available.
    assert values_of("Historical property match status") == ["matched", "unmatched"]

    # None of the overlay scaffolding columns may survive finalization.
    remaining_columns = set(finalized.columns)
    for source, overlay, _dtype in _LISTING_OVERLAY_SOURCES:
        assert overlay not in remaining_columns, source