This commit is contained in:
Andras Schmelczer 2026-05-26 19:45:13 +01:00
parent c645b0f1d4
commit 39ef5c6646
79 changed files with 5660 additions and 2199 deletions

View file

@ -10,7 +10,11 @@ from pathlib import Path
from PIL import Image, ImageDraw
from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
from pipeline.transform.transform_poi import (
NAPTAN_EMOJIS,
SCHOOL_ICON_CATEGORIES,
_CATEGORIES,
)
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
SPRITES_BASE = "https://protomaps.github.io/basemaps-assets/sprites/v4"
@ -109,6 +113,9 @@ def collect_twemoji_codes() -> list[str]:
for emoji in NAPTAN_EMOJIS.values():
emojis.add(emoji)
for emoji in SCHOOL_ICON_CATEGORIES.values():
emojis.add(emoji)
# First codepoint hex, matching frontend logic
return sorted({f"{ord(e[0]):x}" for e in emojis})

View file

@ -124,6 +124,8 @@ def build_crime_hotspot_tiles(
str(max_zoom),
"--drop-densest-as-needed",
"--extend-zooms-if-still-dropping",
"--temporary-directory",
tmp,
str(ndjson_path),
],
check=True,

View file

@ -0,0 +1,960 @@
import argparse
import math
import re
import tempfile
from pathlib import Path

import polars as pl
from thefuzz import fuzz
from tqdm import tqdm

from pipeline.local_temp import local_tmp_dir
from pipeline.transform.join_epc_pp import _scan_epc_certificates
from pipeline.utils.fuzzy_join import normalize_address_key, normalize_postcode_key
from pipeline.utils.postcode_mapping import build_postcode_mapping
# NOTE(review): not referenced anywhere in the visible part of this module —
# confirm whether a floor-area sanity filter was intended or remove it.
MIN_FLOOR_AREA_M2 = 10.0

# Acceptance thresholds for listing -> historical-property matching.
# The "with numbers" threshold applies when the listing address contains at
# least one digit run, the "without numbers" one otherwise. The margin is the
# minimum lead the best candidate must have over the runner-up.
PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
PROPERTY_MATCH_MIN_MARGIN = 4.0

# Same thresholds for direct listing -> EPC certificate matching.
EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
EPC_MATCH_MIN_MARGIN = 4.0

# Written to the "Actual listing enrichment version" output column.
ENRICHMENT_VERSION = 1

# Matches each run of digits in an address string.
_NUMBER_RE = re.compile(r"\d+")

# Columns the listings parquet must contain; _read_listings raises otherwise.
LISTING_REQUIRED_COLUMNS = [
    "Bedrooms",
    "Bathrooms",
    "Number of bedrooms & living rooms",
    "lon",
    "lat",
    "Postcode",
    "Address per Property Register",
    "Leasehold/Freehold",
    "Property type",
    "Property sub-type",
    "Price qualifier",
    "Total floor area (sqm)",
    "Listing URL",
    "Listing features",
    "Listing date",
    "Listing status",
    "Asking price",
    "Asking price per sqm",
]

# Columns read from the historical properties parquet when they exist there.
# After matching they are carried onto the listing with a "_property_" prefix.
PROPERTY_CANDIDATE_COLUMNS = [
    "Address per Property Register",
    "Postcode",
    "Leasehold/Freehold",
    "Last known price",
    "Date of last transaction",
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Total floor area (sqm)",
    "Number of bedrooms & living rooms",
    "Interior height (m)",
    "Construction year",
    "Former council house",
    "Is construction date approximate",
    "Listed building",
    "Estimated monthly rent",
    "Street tree density percentile",
    "Property type",
    "Price per sqm",
    "Estimated current price",
    "Est. price per sqm",
]

# NOTE(review): not referenced in the visible part of this module.
PROPERTY_ENRICHMENT_COLUMNS = [
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Interior height (m)",
    "Construction year",
    "Former council house",
    "Is construction date approximate",
    "Listed building",
    "Estimated monthly rent",
    "Street tree density percentile",
    "Date of last transaction",
]

# Columns produced for each EPC candidate; after matching they are carried
# onto the listing with an "_epc_" prefix.
EPC_ENRICHMENT_COLUMNS = [
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Total floor area (sqm)",
    "Number of bedrooms & living rooms",
    "Interior height (m)",
    "Construction year",
    "Former council house",
]

# Energy ratings kept as-is; any other value is replaced by null.
EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]

# Listing values outside these sets fall back to the matched property's value.
TENURE_VALUES = ["Freehold", "Leasehold"]
PROPERTY_TYPE_VALUES = [
    "Detached",
    "Semi-Detached",
    "Terraced",
    "Flats/Maisonettes",
    "Other",
]

# Dtypes used for all-null placeholder columns created by
# _ensure_prefixed_columns; columns not listed here default to pl.Utf8 there.
COLUMN_DTYPES = {
    "Address per EPC": pl.Utf8,
    "Current energy rating": pl.Utf8,
    "Potential energy rating": pl.Utf8,
    "Total floor area (sqm)": pl.Float64,
    "Number of bedrooms & living rooms": pl.Int32,
    "Interior height (m)": pl.Float64,
    "Construction year": pl.UInt16,
    "Former council house": pl.Utf8,
    "Is construction date approximate": pl.UInt8,
    "Listed building": pl.Utf8,
    "Estimated monthly rent": pl.Float32,
    "Street tree density percentile": pl.Float32,
    "Date of last transaction": pl.Datetime("us"),
    "Property type": pl.Utf8,
    "Leasehold/Freehold": pl.Utf8,
}
def _canonical_postcode_expr(column: str) -> pl.Expr:
    """Build an expression that reformats ``column`` as a spaced postcode.

    The value is upper-cased and stripped of every character outside
    ``A-Z0-9``. If the compact form matches the postcode pattern, a single
    space is inserted before the trailing digit-plus-two-letters group;
    otherwise the expression yields null.
    """
    upper = pl.col(column).cast(pl.Utf8).str.to_uppercase()
    compact = upper.str.replace_all(r"[^A-Z0-9]+", "").str.strip_chars()
    looks_like_postcode = compact.str.contains(r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$")
    spaced = compact.str.replace(r"^(.+)([0-9][A-Z]{2})$", "${1} ${2}")
    return pl.when(looks_like_postcode).then(spaced).otherwise(None)
def _clean_string_expr(column: str) -> pl.Expr:
    """Return ``column`` as trimmed text, with blank strings turned into null."""
    text = pl.col(column).cast(pl.Utf8).str.strip_chars()
    is_blank = text == ""
    return pl.when(is_blank).then(None).otherwise(text)
def _coalesce_non_empty(*columns: str) -> pl.Expr:
    """Return the first value among ``columns`` that is neither null nor blank.

    Each column is cast to text; a value that is empty after trimming is
    treated as null. The surviving value is returned untrimmed.
    """
    options: list[pl.Expr] = []
    for name in columns:
        as_text = pl.col(name).cast(pl.Utf8)
        is_blank = as_text.str.strip_chars() == ""
        options.append(pl.when(is_blank).then(None).otherwise(as_text))
    return pl.coalesce(options)
def _valid_number_expr(column: str) -> pl.Expr:
    """Return ``column`` with every non-finite value replaced by null."""
    value = pl.col(column)
    return pl.when(value.is_finite()).then(value).otherwise(None)
def _read_listings(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
    """Load the listings parquet and attach the keys used for matching.

    Adds a ``_listing_idx`` row index, the canonical original postcode and
    normalised address/postcode match keys. Postcodes with an entry in the
    mapping built from ``arcgis_path`` are replaced by their ``new_postcode``,
    and the postcode match key is recomputed from the final postcode.

    Raises:
        ValueError: if any column in ``LISTING_REQUIRED_COLUMNS`` is missing.
    """
    available = set(pl.scan_parquet(listings_path).collect_schema().names())
    missing = sorted(set(LISTING_REQUIRED_COLUMNS) - available)
    if missing:
        raise ValueError(f"{listings_path} is missing listing columns: {missing}")

    match_keys = [
        _canonical_postcode_expr("Postcode").alias("_original_postcode"),
        normalize_address_key(pl.col("Address per Property Register")).alias(
            "_listing_match_address"
        ),
        normalize_postcode_key(pl.col("Postcode")).alias("_listing_match_postcode"),
    ]
    raw = (
        pl.scan_parquet(listings_path)
        .with_row_index("_listing_idx")
        .with_columns(match_keys)
        .collect(engine="streaming")
    )

    mapping = build_postcode_mapping(arcgis_path)
    remapped = raw.join(
        mapping,
        left_on="_original_postcode",
        right_on="old_postcode",
        how="left",
    )
    # Preference order: mapped postcode, canonical original, raw input value.
    final_postcode = pl.coalesce("new_postcode", "_original_postcode", "Postcode")
    remapped = remapped.with_columns(final_postcode.alias("Postcode")).drop(
        "new_postcode", strict=False
    )
    # The match key must follow the (possibly remapped) postcode.
    return remapped.with_columns(
        normalize_postcode_key(pl.col("Postcode")).alias("_listing_match_postcode"),
    )
def _load_property_candidates(
    properties_path: Path, listing_postcodes: list[str]
) -> pl.DataFrame:
    """Load historical properties that share a postcode key with a listing.

    Only the ``PROPERTY_CANDIDATE_COLUMNS`` present in the file are read. Rows
    without any usable normalised address are dropped and the remainder gets a
    ``_property_row`` index.

    Raises:
        ValueError: if one of the four columns required for matching is absent.
    """
    available = pl.scan_parquet(properties_path).collect_schema().names()
    columns = [name for name in PROPERTY_CANDIDATE_COLUMNS if name in available]

    required = {
        "Address per Property Register",
        "Postcode",
        "Property type",
        "Total floor area (sqm)",
    }
    missing = sorted(required - set(columns))
    if missing:
        raise ValueError(f"{properties_path} is missing property columns: {missing}")

    # The EPC address is optional in the source; fall back to a typed null key.
    if "Address per EPC" in columns:
        epc_key = normalize_address_key(pl.col("Address per EPC")).alias(
            "_match_epc_address"
        )
    else:
        epc_key = pl.lit(None, dtype=pl.Utf8).alias("_match_epc_address")
    register_key = normalize_address_key(
        pl.col("Address per Property Register")
    ).alias("_match_register_address")
    has_any_address = (
        pl.col("_match_register_address").is_not_null()
        | pl.col("_match_epc_address").is_not_null()
    )

    lazy = pl.scan_parquet(properties_path).select(columns)
    lazy = lazy.with_columns(
        normalize_postcode_key(pl.col("Postcode")).alias("_match_postcode")
    )
    lazy = lazy.filter(pl.col("_match_postcode").is_in(listing_postcodes))
    lazy = lazy.with_columns(register_key, epc_key).filter(has_any_address)
    return lazy.with_row_index("_property_row").collect(engine="streaming")
def _property_candidates_by_postcode(
    candidates: pl.DataFrame,
) -> dict[str, list[dict]]:
    """Group candidate rows (as dicts) by their ``_match_postcode`` value.

    Rows whose postcode key is null or empty are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_match_postcode")
        if not key:
            continue
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(record)
    return grouped
def _numbers_compatible(left: str | None, right: str | None) -> bool:
if not left or not right:
return False
left_nums = set(_NUMBER_RE.findall(left))
right_nums = set(_NUMBER_RE.findall(right))
smaller, larger = (
(left_nums, right_nums)
if len(left_nums) <= len(right_nums)
else (right_nums, left_nums)
)
if not smaller and larger:
return False
return smaller.issubset(larger)
def _has_number(address: str | None) -> bool:
return bool(address and _NUMBER_RE.search(address))
def _ratio_bonus(
left: float | int | None, right: float | int | None, pct: float, cap: float
) -> float:
if left is None or right is None:
return 0.0
try:
left_f = float(left)
right_f = float(right)
except (TypeError, ValueError):
return 0.0
if left_f <= 0 or right_f <= 0:
return 0.0
rel = abs(left_f - right_f) / max(left_f, right_f)
if rel > pct:
return 0.0
return cap * (1.0 - rel / pct)
def _rooms_bonus(left: int | None, right: int | None) -> float:
if left is None or right is None:
return 0.0
try:
diff = abs(int(left) - int(right))
except (TypeError, ValueError):
return 0.0
if diff == 0:
return 4.0
if diff == 1:
return 2.0
return 0.0
def _enum_bonus(
left: str | None, right: str | None, *, exact: float, mismatch: float
) -> float:
if not left or not right:
return 0.0
return exact if left == right else mismatch
def _address_score(query: str, candidate: str | None) -> int:
if not candidate:
return 0
return max(
fuzz.token_set_ratio(query, candidate),
fuzz.token_sort_ratio(query, candidate),
)
def _best_property_candidate(listing: dict, candidates: list[dict]) -> dict | None:
query = listing.get("_listing_match_address")
if not query:
return None
listing_has_numbers = _has_number(query)
scored: list[tuple[float, int, dict, str]] = []
for candidate in candidates:
register_address = candidate.get("_match_register_address")
epc_address = candidate.get("_match_epc_address")
if listing_has_numbers and not (
_numbers_compatible(query, register_address)
or _numbers_compatible(query, epc_address)
):
continue
register_score = _address_score(query, register_address)
epc_score = _address_score(query, epc_address)
base_score = max(register_score, epc_score)
if base_score == 0:
continue
score = float(base_score)
score += _enum_bonus(
listing.get("Property type"),
candidate.get("Property type"),
exact=7.0,
mismatch=-8.0,
)
score += _enum_bonus(
listing.get("Leasehold/Freehold"),
candidate.get("Leasehold/Freehold"),
exact=3.0,
mismatch=-3.0,
)
score += _ratio_bonus(
listing.get("Total floor area (sqm)"),
candidate.get("Total floor area (sqm)"),
pct=0.15,
cap=8.0,
)
score += _rooms_bonus(
listing.get("Number of bedrooms & living rooms"),
candidate.get("Number of bedrooms & living rooms"),
)
score += _ratio_bonus(
listing.get("Asking price"),
candidate.get("Estimated current price")
or candidate.get("Last known price"),
pct=0.25,
cap=3.0,
)
matched_address = (
"Address per Property Register"
if register_score >= epc_score
else "Address per EPC"
)
scored.append((score, base_score, candidate, matched_address))
if not scored:
return None
scored.sort(key=lambda item: item[0], reverse=True)
top = scored[0]
runner_up = scored[1][0] if len(scored) > 1 else None
margin = top[0] - runner_up if runner_up is not None else top[0]
threshold = (
PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
if listing_has_numbers
else PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
)
if top[0] < threshold or margin < PROPERTY_MATCH_MIN_MARGIN:
return None
return {
"_listing_idx": listing["_listing_idx"],
"_property_row": top[2]["_property_row"],
"Historical property match score": round(top[0], 1),
"Historical property address score": top[1],
"Historical property match margin": round(margin, 1),
"Historical property match field": top[3],
"Historical property match status": "matched",
}
def _match_properties(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Match every listing against the property candidates in its postcode.

    Returns one row of match diagnostics per matched listing. The frame always
    has the same typed schema, even when it is empty.
    """
    schema = {
        "_listing_idx": pl.UInt32,
        "_property_row": pl.UInt32,
        "Historical property match score": pl.Float32,
        "Historical property address score": pl.Int32,
        "Historical property match margin": pl.Float32,
        "Historical property match field": pl.Utf8,
        "Historical property match status": pl.Utf8,
    }
    if candidates.is_empty():
        return pl.DataFrame(schema=schema)

    buckets = _property_candidates_by_postcode(candidates)
    progress = tqdm(
        listings.iter_rows(named=True),
        total=listings.height,
        desc="Matching historical properties",
    )
    found: list[dict] = []
    for row in progress:
        postcode = row.get("_listing_match_postcode")
        if postcode:
            best = _best_property_candidate(row, buckets.get(postcode, []))
            if best is not None:
                found.append(best)

    if found:
        return pl.DataFrame(found, schema=schema)
    return pl.DataFrame(schema=schema)
def _prefix_columns(df: pl.DataFrame, columns: list[str], prefix: str) -> pl.DataFrame:
    """Rename those of ``columns`` that exist in ``df`` by prepending ``prefix``."""
    mapping: dict[str, str] = {}
    for name in columns:
        if name in df.columns:
            mapping[name] = f"{prefix}{name}"
    return df.rename(mapping)
def _ensure_prefixed_columns(
    df: pl.DataFrame, columns: list[str], prefix: str
) -> pl.DataFrame:
    """Add an all-null ``prefix + column`` for each one missing from ``df``.

    Placeholder dtypes come from ``COLUMN_DTYPES`` and default to ``pl.Utf8``.
    """
    placeholders: list[pl.Expr] = []
    for name in columns:
        target = f"{prefix}{name}"
        if target in df.columns:
            continue
        dtype = COLUMN_DTYPES.get(name, pl.Utf8)
        placeholders.append(pl.lit(None, dtype=dtype).alias(target))
    return df.with_columns(placeholders) if placeholders else df
def _property_match_frame(
    matches: pl.DataFrame, candidates: pl.DataFrame
) -> pl.DataFrame:
    """Attach the matched property's columns, prefixed with ``_property_``.

    An empty ``matches`` frame is returned unchanged.
    """
    if matches.is_empty():
        return matches

    carried = [
        name for name in PROPERTY_CANDIDATE_COLUMNS if name in candidates.columns
    ]
    joined = matches.join(
        candidates.select(["_property_row", *carried]),
        on="_property_row",
        how="left",
    )
    to_prefix = [name for name in PROPERTY_CANDIDATE_COLUMNS if name in joined.columns]
    return _prefix_columns(joined, to_prefix, "_property_")
def _canonical_epc_property_type_expr() -> pl.Expr:
    """Derive a canonical property type from EPC type and built form.

    For a "House" with a usable built form the built form is taken; otherwise
    the EPC property type is used when present. The raw value is then mapped
    through a fixed replacement table.
    """
    built_form = pl.col("built_form")
    property_type = pl.col("epc_property_type")

    built_form_unusable = built_form.is_null() | built_form.is_in(
        ["NO DATA!", "Not Recorded"]
    )
    type_known = property_type.is_not_null()
    house_with_form = type_known & (property_type == "House") & ~built_form_unusable

    raw_type = (
        pl.when(house_with_form)
        .then(built_form)
        .when(type_known)
        .then(property_type)
        .otherwise(None)
    )
    canonical_names = {
        "Flat": "Flats/Maisonettes",
        "Maisonette": "Flats/Maisonettes",
        "End-Terrace": "Terraced",
        "Mid-Terrace": "Terraced",
        "Enclosed End-Terrace": "Terraced",
        "Enclosed Mid-Terrace": "Terraced",
        "Bungalow": "Other",
        "Park home": "Other",
        "House": "Other",
    }
    return raw_type.replace(canonical_names)
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    """Extract the first four-digit year from an age-band text column.

    Values without a four-digit run, or that do not fit ``UInt16``, become null.
    """
    band = pl.col(column).cast(pl.Utf8)
    trimmed = band.str.replace("England and Wales: ", "").str.replace(" onwards", "")
    first_year = trimmed.str.extract(r"(\d{4})", 1)
    return first_year.cast(pl.UInt16, strict=False)
def _fractional_year_expr(column: str) -> pl.Expr:
    """Convert a temporal column to ``year + (month - 1) / 12`` as Float32."""
    whole_years = pl.col(column).dt.year().cast(pl.Float32)
    elapsed_months = pl.col(column).dt.month().cast(pl.Float32) - 1.0
    return whole_years + elapsed_months / 12.0
def _load_epc_candidates(
    epc_path: Path, listing_postcodes: list[str], temp_dir: Path
) -> pl.DataFrame:
    """Load one EPC candidate per address for the listings' postcodes.

    Certificates are restricted to postcode keys present in
    ``listing_postcodes`` and reduced to one row per (address key, postcode
    key), keeping the certificate with the most recent ``inspection_date``.
    An address is flagged "Former council house" = "Yes" when any of its
    certificates has a tenure containing "social", otherwise "No".

    Args:
        epc_path: EPC certificates source passed to ``_scan_epc_certificates``.
        listing_postcodes: Normalised postcode keys of the listings.
        temp_dir: Scratch directory passed to ``_scan_epc_certificates``.

    Returns:
        A frame with ``_epc_row``, the match keys, the canonical property type
        and the ``EPC_ENRICHMENT_COLUMNS``.
    """
    dedup_key = ["_epc_match_address", "_epc_match_postcode"]

    # Both branches below work on the same postcode-restricted certificates.
    in_scope = (
        _scan_epc_certificates(epc_path, temp_dir)
        .with_columns(
            normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
            normalize_postcode_key(pl.col("epc_postcode")).alias(
                "_epc_match_postcode"
            ),
        )
        .filter(pl.col("_epc_match_postcode").is_in(listing_postcodes))
    )

    epc = (
        # nulls_last=True: a certificate without an inspection date must never
        # be chosen over a dated one as the "latest" for its address.
        in_scope.sort("inspection_date", descending=True, nulls_last=True)
        .group_by(dedup_key)
        .first()
        .with_columns(
            _canonical_epc_property_type_expr().alias("_epc_canonical_property_type"),
            _construction_year_expr().alias("Construction year"),
            pl.when(pl.col("current_energy_rating").is_in(EPC_RATING_VALUES))
            .then(pl.col("current_energy_rating"))
            .otherwise(None)
            .alias("Current energy rating"),
            pl.when(pl.col("potential_energy_rating").is_in(EPC_RATING_VALUES))
            .then(pl.col("potential_energy_rating"))
            .otherwise(None)
            .alias("Potential energy rating"),
            pl.col("total_floor_area").alias("Total floor area (sqm)"),
            pl.col("number_habitable_rooms").alias("Number of bedrooms & living rooms"),
            pl.col("floor_height").alias("Interior height (m)"),
            pl.col("epc_address").alias("Address per EPC"),
        )
        .drop("tenure", strict=False)
    )

    # Looks at every certificate of the address, not only the latest one.
    social_tenure = (
        in_scope.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
        .select(dedup_key)
        .unique()
        .with_columns(pl.lit("Yes").alias("Former council house"))
    )

    return (
        epc.join(social_tenure, on=dedup_key, how="left")
        .with_columns(pl.col("Former council house").fill_null("No"))
        .filter(pl.col("_epc_match_address").is_not_null())
        .with_row_index("_epc_row")
        .select(
            "_epc_row",
            "_epc_match_address",
            "_epc_match_postcode",
            "_epc_canonical_property_type",
            *EPC_ENRICHMENT_COLUMNS,
        )
        .collect(engine="streaming")
    )
def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
    """Group EPC candidate rows (as dicts) by ``_epc_match_postcode``.

    Rows whose postcode key is null or empty are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_epc_match_postcode")
        if not key:
            continue
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(record)
    return grouped
def _best_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
query = listing.get("_listing_match_address")
if not query:
return None
listing_has_numbers = _has_number(query)
scored: list[tuple[float, int, dict]] = []
for candidate in candidates:
address = candidate.get("_epc_match_address")
if listing_has_numbers and not _numbers_compatible(query, address):
continue
base_score = _address_score(query, address)
if base_score == 0:
continue
score = float(base_score)
score += _enum_bonus(
listing.get("Property type"),
candidate.get("_epc_canonical_property_type"),
exact=6.0,
mismatch=-6.0,
)
score += _ratio_bonus(
listing.get("Total floor area (sqm)"),
candidate.get("Total floor area (sqm)"),
pct=0.12,
cap=8.0,
)
score += _rooms_bonus(
listing.get("Number of bedrooms & living rooms"),
candidate.get("Number of bedrooms & living rooms"),
)
scored.append((score, base_score, candidate))
if not scored:
return None
scored.sort(key=lambda item: item[0], reverse=True)
top = scored[0]
runner_up = scored[1][0] if len(scored) > 1 else None
margin = top[0] - runner_up if runner_up is not None else top[0]
threshold = (
EPC_MATCH_MIN_SCORE_WITH_NUMBERS
if listing_has_numbers
else EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
)
if top[0] < threshold or margin < EPC_MATCH_MIN_MARGIN:
return None
return {
"_listing_idx": listing["_listing_idx"],
"_epc_row": top[2]["_epc_row"],
"EPC match score": round(top[0], 1),
"EPC address score": top[1],
"EPC match margin": round(margin, 1),
"EPC match status": "matched",
}
def _match_epc(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Match every listing against the EPC candidates in its postcode.

    Returns one row of match diagnostics per matched listing. The frame always
    has the same typed schema, even when it is empty.
    """
    schema = {
        "_listing_idx": pl.UInt32,
        "_epc_row": pl.UInt32,
        "EPC match score": pl.Float32,
        "EPC address score": pl.Int32,
        "EPC match margin": pl.Float32,
        "EPC match status": pl.Utf8,
    }
    if candidates.is_empty():
        return pl.DataFrame(schema=schema)

    buckets = _epc_candidates_by_postcode(candidates)
    progress = tqdm(
        listings.iter_rows(named=True),
        total=listings.height,
        desc="Matching EPC certificates",
    )
    found: list[dict] = []
    for row in progress:
        postcode = row.get("_listing_match_postcode")
        if postcode:
            best = _best_epc_candidate(row, buckets.get(postcode, []))
            if best is not None:
                found.append(best)

    if found:
        return pl.DataFrame(found, schema=schema)
    return pl.DataFrame(schema=schema)
def _epc_match_frame(matches: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Attach the matched certificate's columns, prefixed with ``_epc_``.

    An empty ``matches`` frame is returned unchanged.
    """
    if matches.is_empty():
        return matches

    certificate_columns = candidates.select("_epc_row", *EPC_ENRICHMENT_COLUMNS)
    joined = matches.join(certificate_columns, on="_epc_row", how="left")
    to_prefix = [name for name in EPC_ENRICHMENT_COLUMNS if name in joined.columns]
    return _prefix_columns(joined, to_prefix, "_epc_")
def _join_postcode_features(
    listings: pl.DataFrame, postcode_features_path: Path
) -> pl.DataFrame:
    """Left-join postcode-level features onto the listings by ``Postcode``.

    Feature columns whose names clash with listing columns receive the
    ``_postcode`` suffix.
    """
    features = pl.scan_parquet(postcode_features_path).collect(engine="streaming")
    return listings.join(features, on="Postcode", how="left", suffix="_postcode")
def _coalesce_feature_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Resolve the final feature columns from listing, EPC and property data.

    Expects the ``_epc_*`` and ``_property_*`` helper columns referenced below
    to exist on ``df``. Where both sources are consulted, the direct EPC match
    is preferred over the historical-property match. The price-per-sqm columns
    are derived afterwards from the asking price and the resolved floor area.
    """
    with_columns: list[pl.Expr] = [
        pl.lit(ENRICHMENT_VERSION, dtype=pl.UInt16).alias(
            "Actual listing enrichment version"
        ),
        _coalesce_non_empty(
            "_epc_Address per EPC",
            "_property_Address per EPC",
        ).alias("Address per EPC"),
        # The listing's own type wins when it is a recognised value;
        # otherwise fall back to the matched property's type.
        pl.when(pl.col("Property type").is_in(PROPERTY_TYPE_VALUES))
        .then(pl.col("Property type"))
        .otherwise(pl.col("_property_Property type"))
        .alias("Property type"),
        # Same rule for tenure.
        pl.when(pl.col("Leasehold/Freehold").is_in(TENURE_VALUES))
        .then(pl.col("Leasehold/Freehold"))
        .otherwise(pl.col("_property_Leasehold/Freehold"))
        .alias("Leasehold/Freehold"),
        # First finite floor area: listing, then EPC, then property.
        pl.coalesce(
            _valid_number_expr("Total floor area (sqm)"),
            _valid_number_expr("_epc_Total floor area (sqm)"),
            _valid_number_expr("_property_Total floor area (sqm)"),
        ).alias("Total floor area (sqm)"),
        # A listing room count that is null or not positive is treated as
        # unknown and replaced by the EPC or property value.
        pl.when(pl.col("Number of bedrooms & living rooms") > 0)
        .then(pl.col("Number of bedrooms & living rooms"))
        .otherwise(
            pl.coalesce(
                pl.col("_epc_Number of bedrooms & living rooms"),
                pl.col("_property_Number of bedrooms & living rooms"),
            )
        )
        .cast(pl.Int32, strict=False)
        .alias("Number of bedrooms & living rooms"),
        # Both price columns are filled from the asking price.
        pl.col("Asking price").alias("Estimated current price"),
        pl.col("Asking price").alias("Last known price"),
        _coalesce_non_empty(
            "_epc_Current energy rating",
            "_property_Current energy rating",
        ).alias("Current energy rating"),
        _coalesce_non_empty(
            "_epc_Potential energy rating",
            "_property_Potential energy rating",
        ).alias("Potential energy rating"),
        pl.coalesce(
            _valid_number_expr("_epc_Interior height (m)"),
            _valid_number_expr("_property_Interior height (m)"),
        ).alias("Interior height (m)"),
        pl.coalesce(
            pl.col("_epc_Construction year"),
            pl.col("_property_Construction year"),
        )
        .cast(pl.UInt16, strict=False)
        .alias("Construction year"),
        # Unknown is reported as "No".
        _coalesce_non_empty(
            "_epc_Former council house",
            "_property_Former council house",
        )
        .fill_null("No")
        .alias("Former council house"),
        # The remaining features only exist on the historical-property side.
        pl.col("_property_Is construction date approximate").alias(
            "Is construction date approximate"
        ),
        # Unknown is reported as "No".
        pl.col("_property_Listed building").fill_null("No").alias("Listed building"),
        pl.col("_property_Estimated monthly rent").alias("Estimated monthly rent"),
        pl.col("_property_Street tree density percentile").alias(
            "Street tree density percentile"
        ),
        # Stored as a fractional year rather than a timestamp.
        _fractional_year_expr("_property_Date of last transaction").alias(
            "Date of last transaction"
        ),
    ]
    df = df.with_columns(with_columns)
    # Second pass: needs the floor area resolved above.
    df = df.with_columns(
        pl.when(
            pl.col("Asking price").is_not_null()
            & pl.col("Total floor area (sqm)").is_not_null()
            & (pl.col("Total floor area (sqm)") > 0)
        )
        .then((pl.col("Asking price") / pl.col("Total floor area (sqm)")).round(0))
        .otherwise(None)
        .cast(pl.Int32, strict=False)
        .alias("Asking price per sqm"),
    ).with_columns(
        # Both per-sqm price columns mirror the asking price per sqm.
        pl.col("Asking price per sqm").alias("Est. price per sqm"),
        pl.col("Asking price per sqm").alias("Price per sqm"),
    )
    return df
def _drop_internal_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Remove helper columns that must not appear in the written output.

    Drops every column starting with ``_property_`` or ``_epc_`` plus a fixed
    set of bookkeeping names.
    """
    helper_prefixes = ("_property_", "_epc_")
    helper_names = {
        "_listing_idx",
        "_listing_match_address",
        "_listing_match_postcode",
        "_original_postcode",
        "_property_row",
        "_epc_row",
        "lat_postcode",
        "lon_postcode",
    }

    def is_internal(name: str) -> bool:
        return name in helper_names or name.startswith(helper_prefixes)

    to_drop = [name for name in df.columns if is_internal(name)]
    return df.drop(to_drop, strict=False)
def build_enriched_actual_listings(
    listings_path: Path,
    properties_path: Path,
    postcode_features_path: Path,
    arcgis_path: Path,
    output_path: Path,
    *,
    epc_path: Path | None = None,
) -> pl.DataFrame:
    """Enrich scraped listings and write them to ``output_path`` as parquet.

    Steps: load listings, fuzzy-match them to historical properties (and to
    EPC certificates when ``epc_path`` is given), join postcode features,
    resolve the final feature columns and drop the internal helper columns.

    Args:
        listings_path: Scraped listings parquet.
        properties_path: Historical properties parquet.
        postcode_features_path: Postcode-level feature parquet.
        arcgis_path: Postcode parquet used to build the postcode mapping.
        output_path: Destination parquet; parent directories are created.
        epc_path: Optional EPC certificates source; ``None`` skips direct EPC
            matching.

    Returns:
        The enriched listings frame that was written.

    Raises:
        ValueError: if the listings or properties parquet lacks required
            columns.
    """
    print(f"Loading listings from {listings_path}...")
    listings = _read_listings(listings_path, arcgis_path)
    listing_postcodes = (
        listings.select("_listing_match_postcode")
        .drop_nulls()
        .unique()
        .to_series()
        .to_list()
    )
    print(f"Listings: {listings.height}; unique postcodes: {len(listing_postcodes)}")

    print(f"Loading property candidates from {properties_path}...")
    property_candidates = _load_property_candidates(properties_path, listing_postcodes)
    print(f"Property candidates: {property_candidates.height}")
    property_matches = _match_properties(listings, property_candidates)
    print(f"Historical property matches: {property_matches.height}")
    property_match_frame = _property_match_frame(property_matches, property_candidates)

    enriched = _join_postcode_features(listings, postcode_features_path)
    # Join unconditionally. An empty match frame still carries the typed
    # diagnostic columns, so the output schema no longer depends on whether
    # any listing happened to match.
    enriched = enriched.join(property_match_frame, on="_listing_idx", how="left")

    if epc_path is not None:
        with tempfile.TemporaryDirectory(
            prefix="actual_listing_epc_", dir=local_tmp_dir()
        ) as tmpdir:
            print(f"Loading EPC candidates from {epc_path}...")
            epc_candidates = _load_epc_candidates(
                epc_path, listing_postcodes, Path(tmpdir)
            )
            print(f"EPC candidates: {epc_candidates.height}")
            epc_matches = _match_epc(listings, epc_candidates)
            print(f"EPC matches: {epc_matches.height}")
            epc_match_frame = _epc_match_frame(epc_matches, epc_candidates)
        # Same reasoning as for the property match frame above.
        enriched = enriched.join(epc_match_frame, on="_listing_idx", how="left")
    else:
        # EPC matching disabled: only the status column is provided.
        enriched = enriched.with_columns(
            pl.lit(None, dtype=pl.Utf8).alias("EPC match status")
        )

    # Guarantee every helper column the coalescing step reads.
    enriched = _ensure_prefixed_columns(
        enriched, PROPERTY_CANDIDATE_COLUMNS, "_property_"
    )
    enriched = _ensure_prefixed_columns(enriched, EPC_ENRICHMENT_COLUMNS, "_epc_")
    enriched = _coalesce_feature_columns(enriched)
    enriched = _drop_internal_columns(enriched)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    enriched.write_parquet(output_path)
    size_mb = output_path.stat().st_size / (1024 * 1024)
    print(
        f"Wrote {enriched.height} enriched listings to {output_path} ({size_mb:.1f} MB)"
    )
    return enriched
def main() -> None:
    """Command-line entry point: parse the paths and run the enrichment.

    Direct EPC matching is skipped when ``--no-epc`` is given or when the EPC
    source path does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Build a pre-enriched actual-listings parquet for the server"
    )

    # (flag, default path, help text) for the input path options, in help order.
    input_options = (
        (
            "--listings",
            "finder/data/online_listings_buy.parquet",
            "Input scraped listings parquet",
        ),
        (
            "--properties",
            "property-data/properties.parquet",
            "Historical properties parquet",
        ),
        (
            "--postcode-features",
            "property-data/postcode.parquet",
            "Postcode feature parquet",
        ),
        (
            "--arcgis",
            "property-data/arcgis_data.parquet",
            "ArcGIS/NSPL postcode parquet used for terminated-postcode remapping",
        ),
        (
            "--epc",
            "manual-data/domestic-csv.zip",
            "Optional EPC certificates CSV/zip for direct listing-to-EPC fuzzy matching",
        ),
    )
    for flag, default, help_text in input_options:
        parser.add_argument(flag, type=Path, default=Path(default), help=help_text)
    parser.add_argument(
        "--no-epc",
        action="store_true",
        help="Skip direct EPC matching even when --epc exists",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=Path("finder/data/online_listings_buy_enriched.parquet"),
        help="Output enriched listings parquet",
    )
    args = parser.parse_args()

    epc_path = args.epc
    if args.no_epc:
        epc_path = None
    elif not epc_path.exists():
        print(
            f"EPC source not found at {epc_path}; continuing without direct EPC matching"
        )
        epc_path = None

    build_enriched_actual_listings(
        listings_path=args.listings,
        properties_path=args.properties,
        postcode_features_path=args.postcode_features,
        arcgis_path=args.arcgis,
        epc_path=epc_path,
        output_path=args.output,
    )


if __name__ == "__main__":
    main()

View file

@ -22,6 +22,7 @@ LISTED_BUILDING_FEATURE = "Listed building"
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
LISTED_BUILDING_NEAREST_POSTCODES = 3
LISTED_BUILDING_MIN_MATCH_SCORE = 95
_UNPUBLISHED_CONSERVATION_AREA_PREFIX = "no data available for publication"
_IOD_PERCENTILE_COLUMNS = [
"Education, Skills and Training Score",
@ -429,19 +430,38 @@ def _normalise_crs(crs: object | None) -> str:
return str(crs) if crs else "EPSG:4326"
def _is_unpublished_conservation_area_record(name: object) -> bool:
return (
isinstance(name, str)
and name.strip().casefold().startswith(_UNPUBLISHED_CONSERVATION_AREA_PREFIX)
)
def _load_conservation_area_geometries(
conservation_areas_path: Path,
) -> tuple[list[BaseGeometry], str]:
metadata, table = pyogrio.read_arrow(conservation_areas_path, columns=[])
metadata, table = pyogrio.read_arrow(conservation_areas_path, columns=["NAME"])
geometry_name = metadata.get("geometry_name") or table.column_names[-1]
names = table["NAME"].combine_chunks().to_pylist()
geometries = []
for geom in from_wkb(table[geometry_name].combine_chunks().to_pylist()):
if geom is not None and not geom.is_empty:
skipped_unpublished = 0
for name, geom in zip(
names, from_wkb(table[geometry_name].combine_chunks().to_pylist()), strict=True
):
if _is_unpublished_conservation_area_record(name):
skipped_unpublished += 1
elif geom is not None and not geom.is_empty:
geometries.append(geom)
if not geometries:
raise ValueError(
f"{conservation_areas_path} does not contain any usable polygon geometries"
)
if skipped_unpublished:
print(
"Skipped "
f"{skipped_unpublished} Historic England unpublished conservation-area "
"placeholder polygons"
)
return geometries, _normalise_crs(metadata.get("crs"))

View file

@ -25,7 +25,7 @@ GREENSPACE_PARK_FUNCTIONS = {
}
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure", "Health"}
DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
DYNAMIC_FILTER_EXCLUDED_CATEGORIES = {"Park"}

View file

@ -0,0 +1,143 @@
from pathlib import Path
import polars as pl
from pipeline.transform.enrich_actual_listings import build_enriched_actual_listings
def test_build_enriched_actual_listings_joins_postcode_and_property_features(
    tmp_path: Path,
) -> None:
    """End-to-end check of the enrichment with EPC matching disabled.

    One listing on postcode "AA1 1AB" is expected to be moved to "AA1 1AA",
    matched to the single historical property there, and to pick up both the
    postcode features and the property's attributes.
    """
    listings_path = tmp_path / "listings.parquet"
    properties_path = tmp_path / "properties.parquet"
    postcode_path = tmp_path / "postcode.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    output_path = tmp_path / "online_listings_buy_enriched.parquet"

    # Listing fixture: tenure and floor area are deliberately missing so they
    # have to come from the matched property.
    pl.DataFrame(
        {
            "Bedrooms": [2],
            "Bathrooms": [1],
            "Number of bedrooms & living rooms": [3],
            "lon": [-0.1],
            "lat": [51.5],
            "Postcode": ["AA1 1AB"],
            "Address per Property Register": ["1 High Street"],
            "Leasehold/Freehold": [None],
            "Property type": ["Terraced"],
            "Property sub-type": ["Terraced"],
            "Price qualifier": [""],
            "Total floor area (sqm)": [None],
            "Listing URL": ["https://example.test/listing"],
            "Listing features": [["Garden"]],
            "Listing date": [None],
            "Listing status": ["For sale"],
            "Asking price": [300_000],
            "Asking price per sqm": [None],
        },
        schema={
            "Bedrooms": pl.Int32,
            "Bathrooms": pl.Int32,
            "Number of bedrooms & living rooms": pl.Int32,
            "lon": pl.Float64,
            "lat": pl.Float64,
            "Postcode": pl.Utf8,
            "Address per Property Register": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Property type": pl.Utf8,
            "Property sub-type": pl.Utf8,
            "Price qualifier": pl.Utf8,
            "Total floor area (sqm)": pl.Float64,
            "Listing URL": pl.Utf8,
            "Listing features": pl.List(pl.Utf8),
            "Listing date": pl.Datetime("us"),
            "Listing status": pl.Utf8,
            "Asking price": pl.Int64,
            "Asking price per sqm": pl.Int32,
        },
    ).write_parquet(listings_path)

    # Historical property fixture: same address, on postcode "AA1 1AA".
    pl.DataFrame(
        {
            "Address per Property Register": ["1 HIGH STREET"],
            "Postcode": ["AA1 1AA"],
            "Leasehold/Freehold": ["Freehold"],
            "Address per EPC": ["1 High Street"],
            "Current energy rating": ["C"],
            "Potential energy rating": ["B"],
            "Total floor area (sqm)": [80.0],
            "Number of bedrooms & living rooms": [4],
            "Interior height (m)": [2.4],
            "Construction year": [1935],
            "Former council house": ["No"],
            "Listed building": ["No"],
            "Estimated monthly rent": [1200.0],
            "Street tree density percentile": [75.0],
            "Property type": ["Terraced"],
            "Estimated current price": [310_000.0],
        },
        schema={
            "Address per Property Register": pl.Utf8,
            "Postcode": pl.Utf8,
            "Leasehold/Freehold": pl.Utf8,
            "Address per EPC": pl.Utf8,
            "Current energy rating": pl.Utf8,
            "Potential energy rating": pl.Utf8,
            "Total floor area (sqm)": pl.Float64,
            "Number of bedrooms & living rooms": pl.Int32,
            "Interior height (m)": pl.Float64,
            "Construction year": pl.UInt16,
            "Former council house": pl.Utf8,
            "Listed building": pl.Utf8,
            "Estimated monthly rent": pl.Float32,
            "Street tree density percentile": pl.Float32,
            "Property type": pl.Utf8,
            "Estimated current price": pl.Float64,
        },
    ).write_parquet(properties_path)

    # Postcode features exist only for "AA1 1AA".
    pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Income Score": [82.5],
            "Within conservation area": ["Yes"],
        }
    ).write_parquet(postcode_path)

    # Postcode source: "AA1 1AB" carries a `doterm` value while "AA1 1AA" does
    # not; the assertions below expect the listing to end up on "AA1 1AA".
    pl.DataFrame(
        {
            "pcds": ["AA1 1AA", "AA1 1AB"],
            "ctry25cd": ["E92000001", "E92000001"],
            "doterm": [None, "202401"],
            "east1m": [100.0, 105.0],
            "north1m": [100.0, 105.0],
        },
        schema={
            "pcds": pl.Utf8,
            "ctry25cd": pl.Utf8,
            "doterm": pl.Utf8,
            "east1m": pl.Float64,
            "north1m": pl.Float64,
        },
    ).write_parquet(arcgis_path)

    result = build_enriched_actual_listings(
        listings_path=listings_path,
        properties_path=properties_path,
        postcode_features_path=postcode_path,
        arcgis_path=arcgis_path,
        output_path=output_path,
        epc_path=None,
    )
    row = result.row(0, named=True)

    assert output_path.exists()
    assert row["Postcode"] == "AA1 1AA"
    assert row["Historical property match status"] == "matched"
    # Postcode-level features joined on the final postcode.
    assert row["Income Score"] == 82.5
    assert row["Within conservation area"] == "Yes"
    # Values missing on the listing are taken from the matched property.
    assert row["Leasehold/Freehold"] == "Freehold"
    assert row["Total floor area (sqm)"] == 80.0
    # 300_000 / 80.0
    assert row["Asking price per sqm"] == 3750
    # The asking price, not the property's own estimate of 310_000.
    assert row["Estimated current price"] == 300_000
    assert row["Current energy rating"] == "C"

View file

@ -1,14 +1,17 @@
import polars as pl
import pyarrow as pa
import pytest
from shapely import box
from shapely import box, to_wkb
from pipeline.transform.merge import (
_AREA_COLUMNS,
CONSERVATION_AREA_FEATURE,
LISTED_BUILDING_FEATURE,
TREE_DENSITY_FEATURE,
_is_unpublished_conservation_area_record,
_is_dynamic_poi_metric_column,
_less_deprived_percentile_expr,
_load_conservation_area_geometries,
_matched_listed_building_flags,
_postcode_conservation_area_flags,
_postcode_listed_building_candidates,
@ -82,6 +85,45 @@ def test_postcode_conservation_area_flags_marks_point_membership() -> None:
]
def test_unpublished_conservation_area_records_are_identified() -> None:
    """Only the publication placeholder name counts as an unpublished record.

    A real area name and a missing (``None``) name must both be reported as
    published.
    """
    placeholder_name = "No data available for publication by HE"
    assert _is_unpublished_conservation_area_record(placeholder_name)

    for published_name in ("Bloomsbury", None):
        assert not _is_unpublished_conservation_area_record(published_name)
def test_load_conservation_area_geometries_skips_unpublished_placeholders(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path,
) -> None:
    """The loader keeps named areas and drops placeholder-named polygons.

    ``pyogrio.read_arrow`` is replaced with a fake that returns one real area
    and one polygon carrying the "No data available for publication by HE"
    name; only the real area may come back from the loader.
    """
    gpkg_path = tmp_path / "conservation_areas.gpkg"
    real_area = box(0, 0, 1, 1)
    # The placeholder polygon covers a far larger extent than the real area.
    placeholder_area = box(-100, -100, 100, 100)

    def fake_read_arrow(path, columns):
        # Pin down how the loader calls the reader before handing back data.
        assert path == gpkg_path
        assert columns == ["NAME"]
        names = [
            "Central Village",
            "No data available for publication by HE",
        ]
        shapes = to_wkb([real_area, placeholder_area])
        metadata = {"geometry_name": "SHAPE", "crs": "EPSG:4326"}
        return metadata, pa.table({"NAME": names, "SHAPE": shapes})

    monkeypatch.setattr("pipeline.transform.merge.pyogrio.read_arrow", fake_read_arrow)

    geometries, crs = _load_conservation_area_geometries(gpkg_path)

    assert crs == "EPSG:4326"
    assert geometries == [real_area]
def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
listed_points = pl.DataFrame(
{

View file

@ -37,6 +37,7 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
assert set(display_names.values()) == {
"Bus stop",
"Café",
"Pharmacy",
"Rail station",
"Restaurant",
"Tesco",
@ -44,7 +45,6 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
assert "poi_waitrose" not in groups
assert "poi_park" not in groups
assert "poi_school" not in groups
assert "poi_pharmacy" not in groups
def test_dynamic_poi_metric_renames_support_park_count_options() -> None:

View file

@ -1316,17 +1316,122 @@ def transform_grocery_retail_points(
).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")
def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
"""Convert the GIAS register parquet into POI rows with school metadata."""
return pl.scan_parquet(gias_path).select(
# Maps each school icon category label to the emoji drawn for it.
# The keys are the labels that _school_icon_category_expr() can emit
# (including its "School" catch-all); transform_gias_schools() passes this
# mapping to replace_strict() to derive the "emoji" column, so every label
# produced there needs an entry here.
SCHOOL_ICON_CATEGORIES: dict[str, str] = {
"Nursery school": "🧸",
"Primary school": "🎒",
"Secondary school": "🏫",
"All-through school": "🏫",
"Sixth form": "📚",
"Further education college": "📚",
"University": "🎓",
"Special school": "🤝",
"School": "🏫",
}
def _school_icon_category_expr() -> pl.Expr:
    """Build the expression that assigns each GIAS row its icon category.

    Precedence (first match wins):

    1. ``type_group``: universities, special schools and colleges span
       multiple phases, so they are bucketed by type group regardless of
       phase.
    2. ``phase``, compared case-insensitively.
    3. ``age_range`` bounds: fallback for rows whose phase matched nothing
       above, e.g. independent and other non-statutory schools where GIAS
       leaves phase null.
    4. The generic "School" bucket.

    Returns:
        A string expression; every value it yields is a key of
        ``SCHOOL_ICON_CATEGORIES``.
    """
    # GIAS phase mixes casing ("Middle deemed Primary" vs "Middle deemed
    # primary") so we normalise before matching.
    phase = pl.col("phase").str.to_lowercase()

    # age_range is "<min><dash><max>"; either bound may be missing. The bounds
    # are pulled out with regexes rather than by splitting on one literal
    # separator: the dash may be a hyphen, an en dash or an em dash, and it is
    # written with escapes so the character cannot be silently lost from this
    # file. A value with no dash at all is read as a lone minimum.
    dash = "[-\u2013\u2014]"
    age_range = pl.col("age_range")
    min_age = age_range.str.extract(rf"^\s*(\d+)\s*(?:{dash}|$)", 1).cast(
        pl.Int32, strict=False
    )
    max_age = age_range.str.extract(rf"{dash}\s*(\d+)\s*$", 1).cast(
        pl.Int32, strict=False
    )

    return (
        pl.when(pl.col("type_group") == "Universities")
        .then(pl.lit("University"))
        .when(pl.col("type_group") == "Special schools")
        .then(pl.lit("Special school"))
        .when(pl.col("type_group") == "Colleges")
        .then(pl.lit("Further education college"))
        .when(phase == "nursery")
        .then(pl.lit("Nursery school"))
        .when(phase.is_in(["primary", "middle deemed primary"]))
        .then(pl.lit("Primary school"))
        .when(phase.is_in(["secondary", "middle deemed secondary"]))
        .then(pl.lit("Secondary school"))
        .when(phase == "all-through")
        .then(pl.lit("All-through school"))
        .when(phase.is_in(["16 plus", "sixth form"]))
        .then(pl.lit("Sixth form"))
        # Age-range fallback for null-phase rows (≈3k Independents + Academies
        # GIAS doesn't classify by phase). A missing bound is null, so the
        # comparisons that need it do not match and the row falls through.
        .when(max_age <= 5)
        .then(pl.lit("Nursery school"))
        .when(min_age >= 16)
        .then(pl.lit("Sixth form"))
        .when((min_age <= 6) & (max_age >= 16))
        .then(pl.lit("All-through school"))
        .when(max_age <= 11)
        .then(pl.lit("Primary school"))
        .when(min_age >= 10)
        .then(pl.lit("Secondary school"))
        .otherwise(pl.lit("School"))
    )
# Human-readable labels for the OEIF overall effectiveness grades.
# Keys are strings because _load_ofsted_ratings() compares the grade column
# against the string literals "1".."4" before looking the label up here.
OFSTED_OEIF_LABELS = {
"1": "Outstanding",
"2": "Good",
"3": "Requires improvement",
"4": "Inadequate",
}
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    """Return a lazy ``(urn, ofsted_rating)`` table of readable Ofsted ratings.

    The "Latest OEIF overall effectiveness" grade is translated into a label:
    grades "1" to "4" use ``OFSTED_OEIF_LABELS``, and "Not judged" (post-2025
    reform schools that only have a report card) is kept verbatim. Rows with
    any other grade, including null, are dropped. ``URN`` is cast to Int64 and
    exposed as ``urn`` so the result can be joined onto the GIAS register.
    """
    oeif_grade = pl.col("Latest OEIF overall effectiveness")

    # (raw grade, label) pairs in the order they are tested.
    grade_labels = [
        (grade, OFSTED_OEIF_LABELS[grade]) for grade in ("1", "2", "3", "4")
    ]
    grade_labels.append(("Not judged", "Not judged"))

    (first_grade, first_label), *remaining = grade_labels
    rating = pl.when(oeif_grade == first_grade).then(pl.lit(first_label))
    for grade, label in remaining:
        rating = rating.when(oeif_grade == grade).then(pl.lit(label))
    # Anything unrecognised becomes null and is removed by the filter below.
    rating = rating.otherwise(None)

    ratings = pl.scan_parquet(ofsted_path).select(
        pl.col("URN").cast(pl.Int64).alias("urn"),
        rating.alias("ofsted_rating"),
    )
    return ratings.filter(pl.col("ofsted_rating").is_not_null())
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
"""Convert the GIAS register parquet into POI rows with school metadata.
Ofsted ratings are joined by URN so each school carries its latest OEIF
overall effectiveness grade (Outstanding/Good/Requires improvement/
Inadequate/Not judged), surfaced in the map popup."""
icon_category_expr = _school_icon_category_expr()
emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
ofsted = _load_ofsted_ratings(ofsted_path)
# category mirrors icon_category so the dashboard renders one toggle per
# school type (Nursery / Primary / Secondary / Sixth form / University /…)
# instead of bundling every GIAS row under a single "School" pill.
return pl.scan_parquet(gias_path).join(ofsted, on="urn", how="left").select(
pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
pl.col("name"),
pl.lit("School").alias("category"),
pl.lit("School").alias("icon_category"),
icon_category_expr.alias("category"),
icon_category_expr.alias("icon_category"),
pl.lit("Education").alias("group"),
pl.col("lat").cast(pl.Float64),
pl.col("lng").cast(pl.Float64),
pl.lit("🏫").alias("emoji"),
emoji_expr.alias("emoji"),
pl.col("phase").alias("school_phase"),
pl.col("type").alias("school_type"),
pl.col("type_group").alias("school_type_group"),
@ -1346,6 +1451,7 @@ def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
pl.col("website").alias("school_website"),
pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
pl.col("head_name").alias("school_head_name"),
pl.col("ofsted_rating").alias("school_ofsted_rating"),
)
@ -1355,6 +1461,7 @@ def transform(
boundary_path: Path,
grocery_retail_points_path: Path,
gias_path: Path,
ofsted_path: Path,
) -> pl.LazyFrame:
lf = pl.scan_parquet(input_path)
@ -1420,7 +1527,12 @@ def transform(
grocery_df = pl.read_parquet(grocery_retail_points_path)
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
frames = [lf, naptan, grocery_pois.lazy(), transform_gias_schools(gias_path)]
frames = [
lf,
naptan,
grocery_pois.lazy(),
transform_gias_schools(gias_path, ofsted_path),
]
return pl.concat(frames, how="diagonal_relaxed")
@ -1453,6 +1565,12 @@ def main():
required=True,
help="GIAS schools register parquet (replaces OSM schools)",
)
parser.add_argument(
"--ofsted",
type=Path,
required=True,
help="Ofsted latest-inspections parquet (provides per-URN ratings)",
)
parser.add_argument(
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
)
@ -1464,6 +1582,7 @@ def main():
args.boundary,
args.grocery_retail_points,
args.gias,
args.ofsted,
).collect(engine="streaming")
df.write_parquet(args.output)

View file

@ -219,6 +219,8 @@ def build_tree_overlay_tiles(
str(max_zoom),
"--drop-smallest-as-needed",
"--extend-zooms-if-still-dropping",
"--temporary-directory",
tmp,
str(ndjson_path),
],
check=True,