alright
This commit is contained in:
parent
c645b0f1d4
commit
39ef5c6646
79 changed files with 5660 additions and 2199 deletions
|
|
@ -10,7 +10,11 @@ from pathlib import Path
|
|||
|
||||
from PIL import Image, ImageDraw
|
||||
|
||||
from pipeline.transform.transform_poi import NAPTAN_EMOJIS, _CATEGORIES
|
||||
from pipeline.transform.transform_poi import (
|
||||
NAPTAN_EMOJIS,
|
||||
SCHOOL_ICON_CATEGORIES,
|
||||
_CATEGORIES,
|
||||
)
|
||||
|
||||
GLYPHS_BASE = "https://protomaps.github.io/basemaps-assets/fonts"
|
||||
SPRITES_BASE = "https://protomaps.github.io/basemaps-assets/sprites/v4"
|
||||
|
|
@ -109,6 +113,9 @@ def collect_twemoji_codes() -> list[str]:
|
|||
for emoji in NAPTAN_EMOJIS.values():
|
||||
emojis.add(emoji)
|
||||
|
||||
for emoji in SCHOOL_ICON_CATEGORIES.values():
|
||||
emojis.add(emoji)
|
||||
|
||||
# First codepoint hex, matching frontend logic
|
||||
return sorted({f"{ord(e[0]):x}" for e in emojis})
|
||||
|
||||
|
|
|
|||
|
|
@ -124,6 +124,8 @@ def build_crime_hotspot_tiles(
|
|||
str(max_zoom),
|
||||
"--drop-densest-as-needed",
|
||||
"--extend-zooms-if-still-dropping",
|
||||
"--temporary-directory",
|
||||
tmp,
|
||||
str(ndjson_path),
|
||||
],
|
||||
check=True,
|
||||
|
|
|
|||
960
pipeline/transform/enrich_actual_listings.py
Normal file
960
pipeline/transform/enrich_actual_listings.py
Normal file
|
|
@ -0,0 +1,960 @@
|
|||
import argparse
|
||||
import re
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
from thefuzz import fuzz
|
||||
from tqdm import tqdm
|
||||
|
||||
from pipeline.local_temp import local_tmp_dir
|
||||
from pipeline.transform.join_epc_pp import _scan_epc_certificates
|
||||
from pipeline.utils.fuzzy_join import normalize_address_key, normalize_postcode_key
|
||||
from pipeline.utils.postcode_mapping import build_postcode_mapping
|
||||
|
||||
# NOTE(review): not referenced anywhere in the visible part of this module;
# presumably a lower bound for plausible floor areas — confirm intended use.
MIN_FLOOR_AREA_M2 = 10.0
# Minimum total score a historical-property candidate must reach when the
# listing address contains at least one digit run.
PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
# Stricter minimum used when the listing address contains no digits.
PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
# Minimum lead of the best property candidate over the runner-up.
PROPERTY_MATCH_MIN_MARGIN = 4.0
# Same three thresholds, applied to direct listing-to-EPC matching.
EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
EPC_MATCH_MIN_MARGIN = 4.0
# Written to the "Actual listing enrichment version" output column.
ENRICHMENT_VERSION = 1

# Matches each run of decimal digits in an address key.
_NUMBER_RE = re.compile(r"\d+")
|
||||
|
||||
# Columns the scraped listings parquet must contain; _read_listings raises
# ValueError when any of them is absent.
LISTING_REQUIRED_COLUMNS = [
    "Bedrooms",
    "Bathrooms",
    "Number of bedrooms & living rooms",
    "lon",
    "lat",
    "Postcode",
    "Address per Property Register",
    "Leasehold/Freehold",
    "Property type",
    "Property sub-type",
    "Price qualifier",
    "Total floor area (sqm)",
    "Listing URL",
    "Listing features",
    "Listing date",
    "Listing status",
    "Asking price",
    "Asking price per sqm",
]

# Columns read from the historical properties parquet when present. After a
# match they are carried onto the listing under the "_property_" prefix.
PROPERTY_CANDIDATE_COLUMNS = [
    "Address per Property Register",
    "Postcode",
    "Leasehold/Freehold",
    "Last known price",
    "Date of last transaction",
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Total floor area (sqm)",
    "Number of bedrooms & living rooms",
    "Interior height (m)",
    "Construction year",
    "Former council house",
    "Is construction date approximate",
    "Listed building",
    "Estimated monthly rent",
    "Street tree density percentile",
    "Property type",
    "Price per sqm",
    "Estimated current price",
    "Est. price per sqm",
]

# NOTE(review): not referenced in the visible part of this module — confirm
# whether it is still needed.
PROPERTY_ENRICHMENT_COLUMNS = [
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Interior height (m)",
    "Construction year",
    "Former council house",
    "Is construction date approximate",
    "Listed building",
    "Estimated monthly rent",
    "Street tree density percentile",
    "Date of last transaction",
]

# Columns produced for every EPC candidate. After a match they are carried
# onto the listing under the "_epc_" prefix.
EPC_ENRICHMENT_COLUMNS = [
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Total floor area (sqm)",
    "Number of bedrooms & living rooms",
    "Interior height (m)",
    "Construction year",
    "Former council house",
]
|
||||
|
||||
# Energy ratings kept as-is; any other EPC rating value is replaced by null.
EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
# Listing tenure values that are trusted; anything else falls back to the
# matched historical property's tenure.
TENURE_VALUES = ["Freehold", "Leasehold"]
# Listing property types that are trusted; anything else falls back to the
# matched historical property's type.
PROPERTY_TYPE_VALUES = [
    "Detached",
    "Semi-Detached",
    "Terraced",
    "Flats/Maisonettes",
    "Other",
]

# Dtypes for the all-null placeholder columns created by
# _ensure_prefixed_columns; columns not listed here default to pl.Utf8 there.
COLUMN_DTYPES = {
    "Address per EPC": pl.Utf8,
    "Current energy rating": pl.Utf8,
    "Potential energy rating": pl.Utf8,
    "Total floor area (sqm)": pl.Float64,
    "Number of bedrooms & living rooms": pl.Int32,
    "Interior height (m)": pl.Float64,
    "Construction year": pl.UInt16,
    "Former council house": pl.Utf8,
    "Is construction date approximate": pl.UInt8,
    "Listed building": pl.Utf8,
    "Estimated monthly rent": pl.Float32,
    "Street tree density percentile": pl.Float32,
    "Date of last transaction": pl.Datetime("us"),
    "Property type": pl.Utf8,
    "Leasehold/Freehold": pl.Utf8,
}
|
||||
|
||||
|
||||
def _canonical_postcode_expr(column: str) -> pl.Expr:
    """Return an expression that canonicalises the postcode in ``column``.

    The value is upper-cased and stripped of every character outside A-Z/0-9.
    If the remainder looks like a postcode, a single space is inserted before
    the final digit-letter-letter group; otherwise the result is null.
    """
    squashed = pl.col(column).cast(pl.Utf8).str.to_uppercase()
    squashed = squashed.str.replace_all(r"[^A-Z0-9]+", "").str.strip_chars()

    looks_valid = squashed.str.contains(r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$")
    spaced = squashed.str.replace(r"^(.+)([0-9][A-Z]{2})$", "${1} ${2}")
    return pl.when(looks_valid).then(spaced).otherwise(None)
|
||||
|
||||
|
||||
def _clean_string_expr(column: str) -> pl.Expr:
    """Return ``column`` as trimmed text, with empty strings turned into null."""
    trimmed = pl.col(column).cast(pl.Utf8).str.strip_chars()
    is_blank = trimmed == ""
    return pl.when(is_blank).then(None).otherwise(trimmed)
|
||||
|
||||
|
||||
def _coalesce_non_empty(*columns: str) -> pl.Expr:
    """Return the first of ``columns`` that is neither null nor blank.

    Blankness is tested on the whitespace-stripped text, but the value that is
    returned is the column's text as stored (not stripped).
    """

    def _blank_to_null(name: str) -> pl.Expr:
        as_text = pl.col(name).cast(pl.Utf8)
        return pl.when(as_text.str.strip_chars() == "").then(None).otherwise(as_text)

    return pl.coalesce([_blank_to_null(name) for name in columns])
|
||||
|
||||
|
||||
def _valid_number_expr(column: str) -> pl.Expr:
    """Return ``column`` with every non-finite value replaced by null."""
    value = pl.col(column)
    return pl.when(value.is_finite()).then(value).otherwise(None)
|
||||
|
||||
|
||||
def _read_listings(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
    """Load the scraped listings and attach the keys used for matching.

    Adds ``_listing_idx`` (row index), ``_original_postcode`` (canonical form
    of the scraped postcode), ``_listing_match_address`` and
    ``_listing_match_postcode``. ``Postcode`` itself is rewritten to the
    remapped postcode when the mapping built from ``arcgis_path`` has one for
    the canonical postcode, else the canonical postcode, else the raw value.

    Raises:
        ValueError: if any of ``LISTING_REQUIRED_COLUMNS`` is missing.
    """
    schema = pl.scan_parquet(listings_path).collect_schema()
    missing = sorted(set(LISTING_REQUIRED_COLUMNS) - set(schema.names()))
    if missing:
        raise ValueError(f"{listings_path} is missing listing columns: {missing}")

    # The postcode match key is derived once, after the remap below; computing
    # it here from the raw postcode would only be overwritten.
    listings = (
        pl.scan_parquet(listings_path)
        .with_row_index("_listing_idx")
        .with_columns(
            _canonical_postcode_expr("Postcode").alias("_original_postcode"),
            normalize_address_key(pl.col("Address per Property Register")).alias(
                "_listing_match_address"
            ),
        )
        .collect(engine="streaming")
    )

    # presumably a frame with old_postcode/new_postcode columns — verify
    # against build_postcode_mapping.
    postcode_mapping = build_postcode_mapping(arcgis_path)
    return (
        listings.join(
            postcode_mapping,
            left_on="_original_postcode",
            right_on="old_postcode",
            how="left",
        )
        .with_columns(
            pl.coalesce("new_postcode", "_original_postcode", "Postcode").alias(
                "Postcode"
            ),
        )
        .drop("new_postcode", strict=False)
        .with_columns(
            normalize_postcode_key(pl.col("Postcode")).alias("_listing_match_postcode"),
        )
    )
|
||||
|
||||
|
||||
def _load_property_candidates(
    properties_path: Path, listing_postcodes: list[str]
) -> pl.DataFrame:
    """Load historical properties located in any of ``listing_postcodes``.

    Only the ``PROPERTY_CANDIDATE_COLUMNS`` that exist in the parquet are read.
    Each row gets ``_match_postcode``, ``_match_register_address`` and
    ``_match_epc_address`` keys; rows with neither address key are dropped and
    the survivors are numbered in ``_property_row``.

    Raises:
        ValueError: if a column required for matching is missing.
    """
    available = set(pl.scan_parquet(properties_path).collect_schema().names())
    columns = [name for name in PROPERTY_CANDIDATE_COLUMNS if name in available]

    required = {
        "Address per Property Register",
        "Postcode",
        "Property type",
        "Total floor area (sqm)",
    }
    missing = sorted(required - set(columns))
    if missing:
        raise ValueError(f"{properties_path} is missing property columns: {missing}")

    register_key = normalize_address_key(pl.col("Address per Property Register"))
    if "Address per EPC" in columns:
        epc_key = normalize_address_key(pl.col("Address per EPC"))
    else:
        # No EPC address available: keep the key column, but all-null.
        epc_key = pl.lit(None, dtype=pl.Utf8)

    in_scope = (
        pl.scan_parquet(properties_path)
        .select(columns)
        .with_columns(
            normalize_postcode_key(pl.col("Postcode")).alias("_match_postcode")
        )
        .filter(pl.col("_match_postcode").is_in(listing_postcodes))
    )
    keyed = in_scope.with_columns(
        register_key.alias("_match_register_address"),
        epc_key.alias("_match_epc_address"),
    )
    has_any_key = (
        pl.col("_match_register_address").is_not_null()
        | pl.col("_match_epc_address").is_not_null()
    )
    return (
        keyed.filter(has_any_key)
        .with_row_index("_property_row")
        .collect(engine="streaming")
    )
|
||||
|
||||
|
||||
def _property_candidates_by_postcode(
    candidates: pl.DataFrame,
) -> dict[str, list[dict]]:
    """Group candidate rows (as dicts) by their ``_match_postcode`` value.

    Rows whose postcode key is null or empty are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_match_postcode")
        if not key:
            continue
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(record)
    return grouped
|
||||
|
||||
|
||||
def _numbers_compatible(left: str | None, right: str | None) -> bool:
|
||||
if not left or not right:
|
||||
return False
|
||||
left_nums = set(_NUMBER_RE.findall(left))
|
||||
right_nums = set(_NUMBER_RE.findall(right))
|
||||
smaller, larger = (
|
||||
(left_nums, right_nums)
|
||||
if len(left_nums) <= len(right_nums)
|
||||
else (right_nums, left_nums)
|
||||
)
|
||||
if not smaller and larger:
|
||||
return False
|
||||
return smaller.issubset(larger)
|
||||
|
||||
|
||||
def _has_number(address: str | None) -> bool:
|
||||
return bool(address and _NUMBER_RE.search(address))
|
||||
|
||||
|
||||
def _ratio_bonus(
|
||||
left: float | int | None, right: float | int | None, pct: float, cap: float
|
||||
) -> float:
|
||||
if left is None or right is None:
|
||||
return 0.0
|
||||
try:
|
||||
left_f = float(left)
|
||||
right_f = float(right)
|
||||
except (TypeError, ValueError):
|
||||
return 0.0
|
||||
if left_f <= 0 or right_f <= 0:
|
||||
return 0.0
|
||||
rel = abs(left_f - right_f) / max(left_f, right_f)
|
||||
if rel > pct:
|
||||
return 0.0
|
||||
return cap * (1.0 - rel / pct)
|
||||
|
||||
|
||||
def _rooms_bonus(left: int | None, right: int | None) -> float:
|
||||
if left is None or right is None:
|
||||
return 0.0
|
||||
try:
|
||||
diff = abs(int(left) - int(right))
|
||||
except (TypeError, ValueError):
|
||||
return 0.0
|
||||
if diff == 0:
|
||||
return 4.0
|
||||
if diff == 1:
|
||||
return 2.0
|
||||
return 0.0
|
||||
|
||||
|
||||
def _enum_bonus(
|
||||
left: str | None, right: str | None, *, exact: float, mismatch: float
|
||||
) -> float:
|
||||
if not left or not right:
|
||||
return 0.0
|
||||
return exact if left == right else mismatch
|
||||
|
||||
|
||||
def _address_score(query: str, candidate: str | None) -> int:
|
||||
if not candidate:
|
||||
return 0
|
||||
return max(
|
||||
fuzz.token_set_ratio(query, candidate),
|
||||
fuzz.token_sort_ratio(query, candidate),
|
||||
)
|
||||
|
||||
|
||||
def _best_property_candidate(listing: dict, candidates: list[dict]) -> dict | None:
|
||||
query = listing.get("_listing_match_address")
|
||||
if not query:
|
||||
return None
|
||||
|
||||
listing_has_numbers = _has_number(query)
|
||||
scored: list[tuple[float, int, dict, str]] = []
|
||||
for candidate in candidates:
|
||||
register_address = candidate.get("_match_register_address")
|
||||
epc_address = candidate.get("_match_epc_address")
|
||||
if listing_has_numbers and not (
|
||||
_numbers_compatible(query, register_address)
|
||||
or _numbers_compatible(query, epc_address)
|
||||
):
|
||||
continue
|
||||
|
||||
register_score = _address_score(query, register_address)
|
||||
epc_score = _address_score(query, epc_address)
|
||||
base_score = max(register_score, epc_score)
|
||||
if base_score == 0:
|
||||
continue
|
||||
|
||||
score = float(base_score)
|
||||
score += _enum_bonus(
|
||||
listing.get("Property type"),
|
||||
candidate.get("Property type"),
|
||||
exact=7.0,
|
||||
mismatch=-8.0,
|
||||
)
|
||||
score += _enum_bonus(
|
||||
listing.get("Leasehold/Freehold"),
|
||||
candidate.get("Leasehold/Freehold"),
|
||||
exact=3.0,
|
||||
mismatch=-3.0,
|
||||
)
|
||||
score += _ratio_bonus(
|
||||
listing.get("Total floor area (sqm)"),
|
||||
candidate.get("Total floor area (sqm)"),
|
||||
pct=0.15,
|
||||
cap=8.0,
|
||||
)
|
||||
score += _rooms_bonus(
|
||||
listing.get("Number of bedrooms & living rooms"),
|
||||
candidate.get("Number of bedrooms & living rooms"),
|
||||
)
|
||||
score += _ratio_bonus(
|
||||
listing.get("Asking price"),
|
||||
candidate.get("Estimated current price")
|
||||
or candidate.get("Last known price"),
|
||||
pct=0.25,
|
||||
cap=3.0,
|
||||
)
|
||||
matched_address = (
|
||||
"Address per Property Register"
|
||||
if register_score >= epc_score
|
||||
else "Address per EPC"
|
||||
)
|
||||
scored.append((score, base_score, candidate, matched_address))
|
||||
|
||||
if not scored:
|
||||
return None
|
||||
scored.sort(key=lambda item: item[0], reverse=True)
|
||||
top = scored[0]
|
||||
runner_up = scored[1][0] if len(scored) > 1 else None
|
||||
margin = top[0] - runner_up if runner_up is not None else top[0]
|
||||
threshold = (
|
||||
PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
|
||||
if listing_has_numbers
|
||||
else PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
|
||||
)
|
||||
if top[0] < threshold or margin < PROPERTY_MATCH_MIN_MARGIN:
|
||||
return None
|
||||
|
||||
return {
|
||||
"_listing_idx": listing["_listing_idx"],
|
||||
"_property_row": top[2]["_property_row"],
|
||||
"Historical property match score": round(top[0], 1),
|
||||
"Historical property address score": top[1],
|
||||
"Historical property match margin": round(margin, 1),
|
||||
"Historical property match field": top[3],
|
||||
"Historical property match status": "matched",
|
||||
}
|
||||
|
||||
|
||||
def _match_properties(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Match every listing against the property candidates in its postcode.

    Returns one row per accepted match with the listing index, the candidate
    row and the match diagnostics. The frame has the same typed schema even
    when there are no candidates or no matches.
    """
    schema = {
        "_listing_idx": pl.UInt32,
        "_property_row": pl.UInt32,
        "Historical property match score": pl.Float32,
        "Historical property address score": pl.Int32,
        "Historical property match margin": pl.Float32,
        "Historical property match field": pl.Utf8,
        "Historical property match status": pl.Utf8,
    }
    if candidates.is_empty():
        return pl.DataFrame(schema=schema)

    by_postcode = _property_candidates_by_postcode(candidates)
    progress = tqdm(
        listings.iter_rows(named=True),
        total=listings.height,
        desc="Matching historical properties",
    )
    accepted = []
    for listing in progress:
        postcode = listing.get("_listing_match_postcode")
        if postcode:
            found = _best_property_candidate(listing, by_postcode.get(postcode, []))
            if found is not None:
                accepted.append(found)

    if accepted:
        return pl.DataFrame(accepted, schema=schema)
    return pl.DataFrame(schema=schema)
|
||||
|
||||
|
||||
def _prefix_columns(df: pl.DataFrame, columns: list[str], prefix: str) -> pl.DataFrame:
    """Rename each of ``columns`` present in ``df`` to ``prefix`` + name."""
    present = df.columns
    mapping = {}
    for name in columns:
        if name in present:
            mapping[name] = f"{prefix}{name}"
    return df.rename(mapping)
|
||||
|
||||
|
||||
def _ensure_prefixed_columns(
    df: pl.DataFrame, columns: list[str], prefix: str
) -> pl.DataFrame:
    """Guarantee that every ``prefix`` + column exists in ``df``.

    Missing columns are added as all-null, typed according to
    ``COLUMN_DTYPES`` (text when the column has no entry there).
    """
    existing = df.columns
    placeholders = []
    for column in columns:
        target = f"{prefix}{column}"
        if target in existing:
            continue
        dtype = COLUMN_DTYPES.get(column, pl.Utf8)
        placeholders.append(pl.lit(None, dtype=dtype).alias(target))

    return df.with_columns(placeholders) if placeholders else df
|
||||
|
||||
|
||||
def _property_match_frame(
    matches: pl.DataFrame, candidates: pl.DataFrame
) -> pl.DataFrame:
    """Attach the matched candidates' attributes to ``matches``.

    The candidate columns are joined on ``_property_row`` and renamed with the
    ``_property_`` prefix. An empty ``matches`` frame is returned unchanged.
    """
    if matches.is_empty():
        return matches

    carried = [
        name for name in PROPERTY_CANDIDATE_COLUMNS if name in candidates.columns
    ]
    lookup = candidates.select(["_property_row", *carried])
    matched = matches.join(lookup, on="_property_row", how="left")

    to_prefix = [
        name for name in PROPERTY_CANDIDATE_COLUMNS if name in matched.columns
    ]
    return _prefix_columns(matched, to_prefix, "_property_")
|
||||
|
||||
|
||||
def _canonical_epc_property_type_expr() -> pl.Expr:
    """Map EPC property type / built form onto the listing property types.

    Houses with a usable ``built_form`` take the built form; every other
    certificate with an ``epc_property_type`` takes that type. The chosen
    value is then translated through the table below (values not in the table
    pass through unchanged). Rows with no EPC property type yield null.
    """
    built_form = pl.col("built_form")
    epc_type = pl.col("epc_property_type")

    bad_built_form = built_form.is_null() | built_form.is_in(
        ["NO DATA!", "Not Recorded"]
    )
    has_epc = epc_type.is_not_null()
    is_house = epc_type == "House"

    canonical_names = {
        "Flat": "Flats/Maisonettes",
        "Maisonette": "Flats/Maisonettes",
        "End-Terrace": "Terraced",
        "Mid-Terrace": "Terraced",
        "Enclosed End-Terrace": "Terraced",
        "Enclosed Mid-Terrace": "Terraced",
        "Bungalow": "Other",
        "Park home": "Other",
        "House": "Other",
    }
    raw_type = (
        pl.when(has_epc & is_house & ~bad_built_form)
        .then(built_form)
        .when(has_epc)
        .then(epc_type)
        .otherwise(None)
    )
    return raw_type.replace(canonical_names)
|
||||
|
||||
|
||||
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    """Extract the first four-digit year from an age-band column as UInt16.

    Values without a four-digit year (or failing the cast) become null.
    """
    band = pl.col(column).cast(pl.Utf8)
    band = band.str.replace("England and Wales: ", "").str.replace(" onwards", "")
    first_year = band.str.extract(r"(\d{4})", 1)
    return first_year.cast(pl.UInt16, strict=False)
|
||||
|
||||
|
||||
def _fractional_year_expr(column: str) -> pl.Expr:
    """Convert a temporal column to ``year + (month - 1) / 12`` as Float32."""
    stamp = pl.col(column)
    whole_years = stamp.dt.year().cast(pl.Float32)
    months_elapsed = stamp.dt.month().cast(pl.Float32) - 1.0
    return whole_years + months_elapsed / 12.0
|
||||
|
||||
|
||||
def _load_epc_candidates(
    epc_path: Path, listing_postcodes: list[str], temp_dir: Path
) -> pl.DataFrame:
    """Load one EPC candidate per address for the given listing postcodes.

    For every (address key, postcode key) pair the certificate with the most
    recent ``inspection_date`` is kept and exposed under the
    ``EPC_ENRICHMENT_COLUMNS`` names. ``Former council house`` is "Yes" when
    any certificate for that pair has a tenure containing "social"
    (case-insensitive) and "No" otherwise. Rows without an address key are
    dropped and the rest are numbered in ``_epc_row``.
    """
    epc_base = _scan_epc_certificates(epc_path, temp_dir).with_columns(
        normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
        normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
    )
    # Shared by both the "latest certificate" and the "social tenure" queries.
    in_scope = epc_base.filter(pl.col("_epc_match_postcode").is_in(listing_postcodes))

    epc = (
        # nulls_last=True: with the default, certificates lacking an
        # inspection date sort first and would be picked as the "latest".
        in_scope.sort("inspection_date", descending=True, nulls_last=True)
        .group_by("_epc_match_address", "_epc_match_postcode")
        .first()
        .with_columns(
            _canonical_epc_property_type_expr().alias("_epc_canonical_property_type"),
            _construction_year_expr().alias("Construction year"),
            pl.when(pl.col("current_energy_rating").is_in(EPC_RATING_VALUES))
            .then(pl.col("current_energy_rating"))
            .otherwise(None)
            .alias("Current energy rating"),
            pl.when(pl.col("potential_energy_rating").is_in(EPC_RATING_VALUES))
            .then(pl.col("potential_energy_rating"))
            .otherwise(None)
            .alias("Potential energy rating"),
            pl.col("total_floor_area").alias("Total floor area (sqm)"),
            pl.col("number_habitable_rooms").alias("Number of bedrooms & living rooms"),
            pl.col("floor_height").alias("Interior height (m)"),
            pl.col("epc_address").alias("Address per EPC"),
        )
        .drop("tenure", strict=False)
    )

    # Looks at every certificate of an address, not only the latest one.
    social_tenure = (
        in_scope.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
        .select("_epc_match_address", "_epc_match_postcode")
        .unique()
        .with_columns(pl.lit("Yes").alias("Former council house"))
    )

    return (
        epc.join(
            social_tenure,
            on=["_epc_match_address", "_epc_match_postcode"],
            how="left",
        )
        .with_columns(pl.col("Former council house").fill_null("No"))
        .filter(pl.col("_epc_match_address").is_not_null())
        .with_row_index("_epc_row")
        .select(
            "_epc_row",
            "_epc_match_address",
            "_epc_match_postcode",
            "_epc_canonical_property_type",
            *EPC_ENRICHMENT_COLUMNS,
        )
        .collect(engine="streaming")
    )
|
||||
|
||||
|
||||
def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
    """Group EPC candidate rows (as dicts) by ``_epc_match_postcode``.

    Rows whose postcode key is null or empty are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_epc_match_postcode")
        if not key:
            continue
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(record)
    return grouped
|
||||
|
||||
|
||||
def _best_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
|
||||
query = listing.get("_listing_match_address")
|
||||
if not query:
|
||||
return None
|
||||
|
||||
listing_has_numbers = _has_number(query)
|
||||
scored: list[tuple[float, int, dict]] = []
|
||||
for candidate in candidates:
|
||||
address = candidate.get("_epc_match_address")
|
||||
if listing_has_numbers and not _numbers_compatible(query, address):
|
||||
continue
|
||||
base_score = _address_score(query, address)
|
||||
if base_score == 0:
|
||||
continue
|
||||
score = float(base_score)
|
||||
score += _enum_bonus(
|
||||
listing.get("Property type"),
|
||||
candidate.get("_epc_canonical_property_type"),
|
||||
exact=6.0,
|
||||
mismatch=-6.0,
|
||||
)
|
||||
score += _ratio_bonus(
|
||||
listing.get("Total floor area (sqm)"),
|
||||
candidate.get("Total floor area (sqm)"),
|
||||
pct=0.12,
|
||||
cap=8.0,
|
||||
)
|
||||
score += _rooms_bonus(
|
||||
listing.get("Number of bedrooms & living rooms"),
|
||||
candidate.get("Number of bedrooms & living rooms"),
|
||||
)
|
||||
scored.append((score, base_score, candidate))
|
||||
|
||||
if not scored:
|
||||
return None
|
||||
scored.sort(key=lambda item: item[0], reverse=True)
|
||||
top = scored[0]
|
||||
runner_up = scored[1][0] if len(scored) > 1 else None
|
||||
margin = top[0] - runner_up if runner_up is not None else top[0]
|
||||
threshold = (
|
||||
EPC_MATCH_MIN_SCORE_WITH_NUMBERS
|
||||
if listing_has_numbers
|
||||
else EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
|
||||
)
|
||||
if top[0] < threshold or margin < EPC_MATCH_MIN_MARGIN:
|
||||
return None
|
||||
return {
|
||||
"_listing_idx": listing["_listing_idx"],
|
||||
"_epc_row": top[2]["_epc_row"],
|
||||
"EPC match score": round(top[0], 1),
|
||||
"EPC address score": top[1],
|
||||
"EPC match margin": round(margin, 1),
|
||||
"EPC match status": "matched",
|
||||
}
|
||||
|
||||
|
||||
def _match_epc(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Match every listing against the EPC candidates in its postcode.

    Returns one row per accepted match with the listing index, the candidate
    row and the match diagnostics. The frame has the same typed schema even
    when there are no candidates or no matches.
    """
    schema = {
        "_listing_idx": pl.UInt32,
        "_epc_row": pl.UInt32,
        "EPC match score": pl.Float32,
        "EPC address score": pl.Int32,
        "EPC match margin": pl.Float32,
        "EPC match status": pl.Utf8,
    }
    if candidates.is_empty():
        return pl.DataFrame(schema=schema)

    by_postcode = _epc_candidates_by_postcode(candidates)
    progress = tqdm(
        listings.iter_rows(named=True),
        total=listings.height,
        desc="Matching EPC certificates",
    )
    accepted = []
    for listing in progress:
        postcode = listing.get("_listing_match_postcode")
        if postcode:
            found = _best_epc_candidate(listing, by_postcode.get(postcode, []))
            if found is not None:
                accepted.append(found)

    if accepted:
        return pl.DataFrame(accepted, schema=schema)
    return pl.DataFrame(schema=schema)
|
||||
|
||||
|
||||
def _epc_match_frame(matches: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Attach the matched certificates' attributes to ``matches``.

    The EPC enrichment columns are joined on ``_epc_row`` and renamed with the
    ``_epc_`` prefix. An empty ``matches`` frame is returned unchanged.
    """
    if matches.is_empty():
        return matches

    lookup = candidates.select("_epc_row", *EPC_ENRICHMENT_COLUMNS)
    matched = matches.join(lookup, on="_epc_row", how="left")
    to_prefix = [name for name in EPC_ENRICHMENT_COLUMNS if name in matched.columns]
    return _prefix_columns(matched, to_prefix, "_epc_")
|
||||
|
||||
|
||||
def _join_postcode_features(
    listings: pl.DataFrame, postcode_features_path: Path
) -> pl.DataFrame:
    """Left-join the postcode feature parquet onto ``listings`` by ``Postcode``.

    Feature columns whose names clash with listing columns receive the
    ``_postcode`` suffix.
    """
    features = pl.scan_parquet(postcode_features_path).collect(engine="streaming")
    joined = listings.join(features, on="Postcode", how="left", suffix="_postcode")
    return joined
|
||||
|
||||
|
||||
def _coalesce_feature_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Merge listing, EPC-match and property-match values into final columns.

    Precedence is: the listing's own value where it is usable, then the direct
    EPC match (``_epc_*``), then the historical property match
    (``_property_*``). Price columns are filled from the asking price, and the
    per-sqm prices are recomputed from asking price and merged floor area.
    Expects every ``_epc_*`` / ``_property_*`` column referenced here to exist.
    """
    asking = pl.col("Asking price")
    area = pl.col("Total floor area (sqm)")
    rooms = pl.col("Number of bedrooms & living rooms")
    listing_type = pl.col("Property type")
    listing_tenure = pl.col("Leasehold/Freehold")

    version = pl.lit(ENRICHMENT_VERSION, dtype=pl.UInt16).alias(
        "Actual listing enrichment version"
    )
    epc_address = _coalesce_non_empty(
        "_epc_Address per EPC",
        "_property_Address per EPC",
    ).alias("Address per EPC")
    # Listing type/tenure win only when they are one of the known values.
    property_type = (
        pl.when(listing_type.is_in(PROPERTY_TYPE_VALUES))
        .then(listing_type)
        .otherwise(pl.col("_property_Property type"))
        .alias("Property type")
    )
    tenure = (
        pl.when(listing_tenure.is_in(TENURE_VALUES))
        .then(listing_tenure)
        .otherwise(pl.col("_property_Leasehold/Freehold"))
        .alias("Leasehold/Freehold")
    )
    floor_area = pl.coalesce(
        _valid_number_expr("Total floor area (sqm)"),
        _valid_number_expr("_epc_Total floor area (sqm)"),
        _valid_number_expr("_property_Total floor area (sqm)"),
    ).alias("Total floor area (sqm)")
    matched_rooms = pl.coalesce(
        pl.col("_epc_Number of bedrooms & living rooms"),
        pl.col("_property_Number of bedrooms & living rooms"),
    )
    room_count = (
        pl.when(rooms > 0)
        .then(rooms)
        .otherwise(matched_rooms)
        .cast(pl.Int32, strict=False)
        .alias("Number of bedrooms & living rooms")
    )
    estimated_price = asking.alias("Estimated current price")
    last_price = asking.alias("Last known price")
    current_rating = _coalesce_non_empty(
        "_epc_Current energy rating",
        "_property_Current energy rating",
    ).alias("Current energy rating")
    potential_rating = _coalesce_non_empty(
        "_epc_Potential energy rating",
        "_property_Potential energy rating",
    ).alias("Potential energy rating")
    interior_height = pl.coalesce(
        _valid_number_expr("_epc_Interior height (m)"),
        _valid_number_expr("_property_Interior height (m)"),
    ).alias("Interior height (m)")
    construction_year = (
        pl.coalesce(
            pl.col("_epc_Construction year"),
            pl.col("_property_Construction year"),
        )
        .cast(pl.UInt16, strict=False)
        .alias("Construction year")
    )
    former_council = (
        _coalesce_non_empty(
            "_epc_Former council house",
            "_property_Former council house",
        )
        .fill_null("No")
        .alias("Former council house")
    )
    approximate_date = pl.col("_property_Is construction date approximate").alias(
        "Is construction date approximate"
    )
    listed_building = (
        pl.col("_property_Listed building").fill_null("No").alias("Listed building")
    )
    monthly_rent = pl.col("_property_Estimated monthly rent").alias(
        "Estimated monthly rent"
    )
    tree_density = pl.col("_property_Street tree density percentile").alias(
        "Street tree density percentile"
    )
    last_transaction = _fractional_year_expr(
        "_property_Date of last transaction"
    ).alias("Date of last transaction")

    # Expression order fixes the position of newly created output columns.
    merged = df.with_columns(
        [
            version,
            epc_address,
            property_type,
            tenure,
            floor_area,
            room_count,
            estimated_price,
            last_price,
            current_rating,
            potential_rating,
            interior_height,
            construction_year,
            former_council,
            approximate_date,
            listed_building,
            monthly_rent,
            tree_density,
            last_transaction,
        ]
    )

    # Evaluated against ``merged``, so ``area`` is the coalesced floor area.
    asking_per_sqm = (
        pl.when(asking.is_not_null() & area.is_not_null() & (area > 0))
        .then((asking / area).round(0))
        .otherwise(None)
        .cast(pl.Int32, strict=False)
        .alias("Asking price per sqm")
    )
    priced = merged.with_columns(asking_per_sqm)
    return priced.with_columns(
        pl.col("Asking price per sqm").alias("Est. price per sqm"),
        pl.col("Asking price per sqm").alias("Price per sqm"),
    )
|
||||
|
||||
|
||||
def _drop_internal_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Remove helper columns that must not reach the output parquet.

    Drops every column starting with ``_property_`` or ``_epc_`` and the
    explicitly listed bookkeeping / suffixed postcode-coordinate columns.
    """
    internal_prefixes = ("_property_", "_epc_")
    internal_exact = {
        "_listing_idx",
        "_listing_match_address",
        "_listing_match_postcode",
        "_original_postcode",
        "_property_row",
        "_epc_row",
        "lat_postcode",
        "lon_postcode",
    }

    def _is_internal(name: str) -> bool:
        return name in internal_exact or name.startswith(internal_prefixes)

    doomed = [name for name in df.columns if _is_internal(name)]
    return df.drop(doomed, strict=False)
|
||||
|
||||
|
||||
def build_enriched_actual_listings(
    listings_path: Path,
    properties_path: Path,
    postcode_features_path: Path,
    arcgis_path: Path,
    output_path: Path,
    *,
    epc_path: Path | None = None,
) -> pl.DataFrame:
    """Enrich scraped listings and write them to ``output_path``.

    Steps: load listings, fuzzy-match them to historical properties, join the
    postcode features, optionally fuzzy-match them directly to EPC
    certificates (``epc_path``), merge everything into the final feature
    columns and drop the internal helper columns.

    The match diagnostic columns (score, address score, margin, field and
    status) are always present in the output. They are null for unmatched
    listings, including when nothing matched at all or EPC matching was
    skipped, so the output schema does not depend on the data.

    Returns:
        The enriched frame that was written to ``output_path``.
    """
    print(f"Loading listings from {listings_path}...")
    listings = _read_listings(listings_path, arcgis_path)
    listing_postcodes = (
        listings.select("_listing_match_postcode")
        .drop_nulls()
        .unique()
        .to_series()
        .to_list()
    )
    print(f"Listings: {listings.height}; unique postcodes: {len(listing_postcodes)}")

    print(f"Loading property candidates from {properties_path}...")
    property_candidates = _load_property_candidates(properties_path, listing_postcodes)
    print(f"Property candidates: {property_candidates.height}")
    property_matches = _match_properties(listings, property_candidates)
    print(f"Historical property matches: {property_matches.height}")
    property_match_frame = _property_match_frame(property_matches, property_candidates)

    enriched = _join_postcode_features(listings, postcode_features_path)
    # Always join: an empty match frame still carries the typed diagnostic
    # columns, which then appear as nulls instead of being absent.
    enriched = enriched.join(property_match_frame, on="_listing_idx", how="left")

    if epc_path is not None:
        with tempfile.TemporaryDirectory(
            prefix="actual_listing_epc_", dir=local_tmp_dir()
        ) as tmpdir:
            print(f"Loading EPC candidates from {epc_path}...")
            epc_candidates = _load_epc_candidates(
                epc_path, listing_postcodes, Path(tmpdir)
            )
            print(f"EPC candidates: {epc_candidates.height}")
            epc_matches = _match_epc(listings, epc_candidates)
            print(f"EPC matches: {epc_matches.height}")
            epc_match_frame = _epc_match_frame(epc_matches, epc_candidates)
    else:
        # With no candidates _match_epc returns its typed zero-row frame,
        # which yields the same null diagnostic columns as "no EPC matches".
        epc_match_frame = _match_epc(listings, pl.DataFrame())
    enriched = enriched.join(epc_match_frame, on="_listing_idx", how="left")

    # Attribute columns missing from the match frames become typed nulls.
    enriched = _ensure_prefixed_columns(
        enriched, PROPERTY_CANDIDATE_COLUMNS, "_property_"
    )
    enriched = _ensure_prefixed_columns(enriched, EPC_ENRICHMENT_COLUMNS, "_epc_")
    enriched = _coalesce_feature_columns(enriched)
    enriched = _drop_internal_columns(enriched)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    enriched.write_parquet(output_path)
    size_mb = output_path.stat().st_size / (1024 * 1024)
    print(
        f"Wrote {enriched.height} enriched listings to {output_path} ({size_mb:.1f} MB)"
    )
    return enriched
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: parse options and build the enriched parquet.

    Every option has a default, so the script can be run with no arguments.
    Direct EPC matching is disabled when ``--no-epc`` is passed, or when the
    ``--epc`` source does not exist on disk (a notice is printed in that case).
    """
    cli = argparse.ArgumentParser(
        description="Build a pre-enriched actual-listings parquet for the server"
    )
    # Options are registered in this order, which is also the --help order.
    option_specs = (
        (
            "--listings",
            {
                "type": Path,
                "default": Path("finder/data/online_listings_buy.parquet"),
                "help": "Input scraped listings parquet",
            },
        ),
        (
            "--properties",
            {
                "type": Path,
                "default": Path("property-data/properties.parquet"),
                "help": "Historical properties parquet",
            },
        ),
        (
            "--postcode-features",
            {
                "type": Path,
                "default": Path("property-data/postcode.parquet"),
                "help": "Postcode feature parquet",
            },
        ),
        (
            "--arcgis",
            {
                "type": Path,
                "default": Path("property-data/arcgis_data.parquet"),
                "help": (
                    "ArcGIS/NSPL postcode parquet used for "
                    "terminated-postcode remapping"
                ),
            },
        ),
        (
            "--epc",
            {
                "type": Path,
                "default": Path("manual-data/domestic-csv.zip"),
                "help": (
                    "Optional EPC certificates CSV/zip for direct "
                    "listing-to-EPC fuzzy matching"
                ),
            },
        ),
        (
            "--no-epc",
            {
                "action": "store_true",
                "help": "Skip direct EPC matching even when --epc exists",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "default": Path("finder/data/online_listings_buy_enriched.parquet"),
                "help": "Output enriched listings parquet",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    options = cli.parse_args()

    # Resolve the EPC source: explicitly disabled, or silently dropped when the
    # file is absent so the rest of the enrichment can still run.
    epc_source = None if options.no_epc else options.epc
    epc_missing = epc_source is not None and not epc_source.exists()
    if epc_missing:
        print(
            f"EPC source not found at {epc_source}; continuing without direct EPC matching"
        )
        epc_source = None

    build_enriched_actual_listings(
        listings_path=options.listings,
        properties_path=options.properties,
        postcode_features_path=options.postcode_features,
        arcgis_path=options.arcgis,
        epc_path=epc_source,
        output_path=options.output,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -22,6 +22,7 @@ LISTED_BUILDING_FEATURE = "Listed building"
|
|||
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
|
||||
LISTED_BUILDING_NEAREST_POSTCODES = 3
|
||||
LISTED_BUILDING_MIN_MATCH_SCORE = 95
|
||||
_UNPUBLISHED_CONSERVATION_AREA_PREFIX = "no data available for publication"
|
||||
|
||||
_IOD_PERCENTILE_COLUMNS = [
|
||||
"Education, Skills and Training Score",
|
||||
|
|
@ -429,19 +430,38 @@ def _normalise_crs(crs: object | None) -> str:
|
|||
return str(crs) if crs else "EPSG:4326"
|
||||
|
||||
|
||||
def _is_unpublished_conservation_area_record(name: object) -> bool:
    """Tell whether *name* is the placeholder label of an unpublished record.

    Anything that is not a string (including ``None``) is never a placeholder.
    Strings are stripped of surrounding whitespace and case-folded before being
    tested against ``_UNPUBLISHED_CONSERVATION_AREA_PREFIX``.
    """
    if not isinstance(name, str):
        return False
    folded = name.strip().casefold()
    return folded.startswith(_UNPUBLISHED_CONSERVATION_AREA_PREFIX)
|
||||
|
||||
|
||||
def _load_conservation_area_geometries(
    conservation_areas_path: Path,
) -> tuple[list[BaseGeometry], str]:
    """Load usable conservation-area geometries and their CRS.

    Records whose ``NAME`` marks them as unpublished placeholders (see
    ``_is_unpublished_conservation_area_record``) are dropped, as are null or
    empty geometries. The number of skipped placeholders is printed when it is
    non-zero.

    Args:
        conservation_areas_path: Vector file readable by ``pyogrio``; it must
            expose a ``NAME`` attribute column.

    Returns:
        A ``(geometries, crs)`` tuple where ``crs`` is the metadata CRS passed
        through ``_normalise_crs``.

    Raises:
        ValueError: If no usable geometry remains after filtering.
    """
    # A single read: NAME is the only attribute needed next to the geometry.
    metadata, table = pyogrio.read_arrow(conservation_areas_path, columns=["NAME"])
    # Fall back to the last column when the metadata does not name the
    # geometry column.
    geometry_name = metadata.get("geometry_name") or table.column_names[-1]
    names = table["NAME"].combine_chunks().to_pylist()
    geometries: list[BaseGeometry] = []
    skipped_unpublished = 0
    # strict=True guards against names and geometries falling out of step.
    for name, geom in zip(
        names, from_wkb(table[geometry_name].combine_chunks().to_pylist()), strict=True
    ):
        if _is_unpublished_conservation_area_record(name):
            skipped_unpublished += 1
        elif geom is not None and not geom.is_empty:
            geometries.append(geom)
    if not geometries:
        raise ValueError(
            f"{conservation_areas_path} does not contain any usable polygon geometries"
        )
    if skipped_unpublished:
        print(
            "Skipped "
            f"{skipped_unpublished} Historic England unpublished conservation-area "
            "placeholder polygons"
        )
    return geometries, _normalise_crs(metadata.get("crs"))
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ GREENSPACE_PARK_FUNCTIONS = {
|
|||
}
|
||||
|
||||
GROCERY_DYNAMIC_FILTER_MIN_POIS = 100
|
||||
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure"}
|
||||
DYNAMIC_FILTER_ALL_GROUPS = {"Public Transport", "Leisure", "Health"}
|
||||
DYNAMIC_FILTER_COUNT_THRESHOLD_GROUPS = {"Groceries"}
|
||||
DYNAMIC_FILTER_EXCLUDED_CATEGORIES = {"Park"}
|
||||
|
||||
|
|
|
|||
143
pipeline/transform/test_enrich_actual_listings.py
Normal file
143
pipeline/transform/test_enrich_actual_listings.py
Normal file
|
|
@ -0,0 +1,143 @@
|
|||
from pathlib import Path
|
||||
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.enrich_actual_listings import build_enriched_actual_listings
|
||||
|
||||
|
||||
def test_build_enriched_actual_listings_joins_postcode_and_property_features(
    tmp_path: Path,
) -> None:
    """Run the enrichment end to end on one listing with EPC matching disabled.

    The listing is on postcode ``AA1 1AB``, which the ArcGIS fixture gives a
    termination date; the only property and postcode-feature rows are on
    ``AA1 1AA``. The assertions expect the listing to come back on ``AA1 1AA``
    carrying the matched property's attributes and the postcode features.
    """
    listings_path = tmp_path / "listings.parquet"
    properties_path = tmp_path / "properties.parquet"
    postcode_path = tmp_path / "postcode.parquet"
    arcgis_path = tmp_path / "arcgis.parquet"
    output_path = tmp_path / "online_listings_buy_enriched.parquet"

    def write_single_row(path: Path, columns: list[tuple]) -> None:
        # columns holds (name, dtype, value) triples describing one row.
        pl.DataFrame(
            {name: [value] for name, _, value in columns},
            schema={name: dtype for name, dtype, _ in columns},
        ).write_parquet(path)

    write_single_row(
        listings_path,
        [
            ("Bedrooms", pl.Int32, 2),
            ("Bathrooms", pl.Int32, 1),
            ("Number of bedrooms & living rooms", pl.Int32, 3),
            ("lon", pl.Float64, -0.1),
            ("lat", pl.Float64, 51.5),
            ("Postcode", pl.Utf8, "AA1 1AB"),
            ("Address per Property Register", pl.Utf8, "1 High Street"),
            ("Leasehold/Freehold", pl.Utf8, None),
            ("Property type", pl.Utf8, "Terraced"),
            ("Property sub-type", pl.Utf8, "Terraced"),
            ("Price qualifier", pl.Utf8, ""),
            ("Total floor area (sqm)", pl.Float64, None),
            ("Listing URL", pl.Utf8, "https://example.test/listing"),
            ("Listing features", pl.List(pl.Utf8), ["Garden"]),
            ("Listing date", pl.Datetime("us"), None),
            ("Listing status", pl.Utf8, "For sale"),
            ("Asking price", pl.Int64, 300_000),
            ("Asking price per sqm", pl.Int32, None),
        ],
    )

    write_single_row(
        properties_path,
        [
            ("Address per Property Register", pl.Utf8, "1 HIGH STREET"),
            ("Postcode", pl.Utf8, "AA1 1AA"),
            ("Leasehold/Freehold", pl.Utf8, "Freehold"),
            ("Address per EPC", pl.Utf8, "1 High Street"),
            ("Current energy rating", pl.Utf8, "C"),
            ("Potential energy rating", pl.Utf8, "B"),
            ("Total floor area (sqm)", pl.Float64, 80.0),
            ("Number of bedrooms & living rooms", pl.Int32, 4),
            ("Interior height (m)", pl.Float64, 2.4),
            ("Construction year", pl.UInt16, 1935),
            ("Former council house", pl.Utf8, "No"),
            ("Listed building", pl.Utf8, "No"),
            ("Estimated monthly rent", pl.Float32, 1200.0),
            ("Street tree density percentile", pl.Float32, 75.0),
            ("Property type", pl.Utf8, "Terraced"),
            ("Estimated current price", pl.Float64, 310_000.0),
        ],
    )

    # Postcode features: dtypes are left to polars' inference.
    postcode_features = pl.DataFrame(
        {
            "Postcode": ["AA1 1AA"],
            "Income Score": [82.5],
            "Within conservation area": ["Yes"],
        }
    )
    postcode_features.write_parquet(postcode_path)

    # Two postcodes: the live one and the listing's, which has a doterm value.
    arcgis_schema = {
        "pcds": pl.Utf8,
        "ctry25cd": pl.Utf8,
        "doterm": pl.Utf8,
        "east1m": pl.Float64,
        "north1m": pl.Float64,
    }
    arcgis_rows = {
        "pcds": ["AA1 1AA", "AA1 1AB"],
        "ctry25cd": ["E92000001", "E92000001"],
        "doterm": [None, "202401"],
        "east1m": [100.0, 105.0],
        "north1m": [100.0, 105.0],
    }
    pl.DataFrame(arcgis_rows, schema=arcgis_schema).write_parquet(arcgis_path)

    result = build_enriched_actual_listings(
        listings_path=listings_path,
        properties_path=properties_path,
        postcode_features_path=postcode_path,
        arcgis_path=arcgis_path,
        output_path=output_path,
        epc_path=None,
    )

    row = result.row(0, named=True)
    assert output_path.exists()
    expected = {
        "Postcode": "AA1 1AA",
        "Historical property match status": "matched",
        "Income Score": 82.5,
        "Within conservation area": "Yes",
        "Leasehold/Freehold": "Freehold",
        "Total floor area (sqm)": 80.0,
        "Asking price per sqm": 3750,
        "Estimated current price": 300_000,
        "Current energy rating": "C",
    }
    for column, value in expected.items():
        assert row[column] == value, column
|
||||
|
|
@ -1,14 +1,17 @@
|
|||
import polars as pl
|
||||
import pyarrow as pa
|
||||
import pytest
|
||||
from shapely import box
|
||||
from shapely import box, to_wkb
|
||||
|
||||
from pipeline.transform.merge import (
|
||||
_AREA_COLUMNS,
|
||||
CONSERVATION_AREA_FEATURE,
|
||||
LISTED_BUILDING_FEATURE,
|
||||
TREE_DENSITY_FEATURE,
|
||||
_is_unpublished_conservation_area_record,
|
||||
_is_dynamic_poi_metric_column,
|
||||
_less_deprived_percentile_expr,
|
||||
_load_conservation_area_geometries,
|
||||
_matched_listed_building_flags,
|
||||
_postcode_conservation_area_flags,
|
||||
_postcode_listed_building_candidates,
|
||||
|
|
@ -82,6 +85,45 @@ def test_postcode_conservation_area_flags_marks_point_membership() -> None:
|
|||
]
|
||||
|
||||
|
||||
def test_unpublished_conservation_area_records_are_identified() -> None:
    """The placeholder label is detected; real names and None are not."""
    placeholder = "No data available for publication by HE"
    assert _is_unpublished_conservation_area_record(placeholder)

    for ordinary in ("Bloomsbury", None):
        assert not _is_unpublished_conservation_area_record(ordinary)
|
||||
|
||||
|
||||
def test_load_conservation_area_geometries_skips_unpublished_placeholders(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path,
) -> None:
    """Placeholder-named polygons are excluded from the loaded geometries.

    ``pyogrio.read_arrow`` is replaced with a stub returning one named area and
    one placeholder record; only the named area's geometry should be returned.
    """
    source_path = tmp_path / "conservation_areas.gpkg"
    published = box(0, 0, 1, 1)
    unpublished = box(-100, -100, 100, 100)
    stub_metadata = {"geometry_name": "SHAPE", "crs": "EPSG:4326"}
    stub_table = pa.table(
        {
            "NAME": [
                "Central Village",
                "No data available for publication by HE",
            ],
            "SHAPE": to_wkb([published, unpublished]),
        }
    )

    def fake_read_arrow(path, columns):
        # The loader must ask for exactly the NAME column of the given file.
        assert path == source_path
        assert columns == ["NAME"]
        return stub_metadata, stub_table

    monkeypatch.setattr("pipeline.transform.merge.pyogrio.read_arrow", fake_read_arrow)

    geometries, crs = _load_conservation_area_geometries(source_path)

    assert crs == "EPSG:4326"
    assert geometries == [published]
|
||||
|
||||
|
||||
def test_postcode_listed_building_candidates_uses_nearby_postcodes() -> None:
|
||||
listed_points = pl.DataFrame(
|
||||
{
|
||||
|
|
|
|||
|
|
@ -37,6 +37,7 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
|
|||
assert set(display_names.values()) == {
|
||||
"Bus stop",
|
||||
"Café",
|
||||
"Pharmacy",
|
||||
"Rail station",
|
||||
"Restaurant",
|
||||
"Tesco",
|
||||
|
|
@ -44,7 +45,6 @@ def test_dynamic_poi_groups_include_requested_categories_only() -> None:
|
|||
assert "poi_waitrose" not in groups
|
||||
assert "poi_park" not in groups
|
||||
assert "poi_school" not in groups
|
||||
assert "poi_pharmacy" not in groups
|
||||
|
||||
|
||||
def test_dynamic_poi_metric_renames_support_park_count_options() -> None:
|
||||
|
|
|
|||
|
|
@ -1316,17 +1316,122 @@ def transform_grocery_retail_points(
|
|||
).select("id", "name", "category", "icon_category", "group", "lat", "lng", "emoji")
|
||||
|
||||
|
||||
def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
|
||||
"""Convert the GIAS register parquet into POI rows with school metadata."""
|
||||
return pl.scan_parquet(gias_path).select(
|
||||
SCHOOL_ICON_CATEGORIES: dict[str, str] = {
|
||||
"Nursery school": "🧸",
|
||||
"Primary school": "🎒",
|
||||
"Secondary school": "🏫",
|
||||
"All-through school": "🏫",
|
||||
"Sixth form": "📚",
|
||||
"Further education college": "📚",
|
||||
"University": "🎓",
|
||||
"Special school": "🤝",
|
||||
"School": "🏫",
|
||||
}
|
||||
|
||||
|
||||
def _school_icon_category_expr() -> pl.Expr:
    """Build the expression that assigns each school row an icon category.

    Rules are tried in order and the first match wins:

    1. ``type_group`` picks University / Special school / Further education
       college.
    2. The lower-cased ``phase`` picks Nursery / Primary / Secondary /
       All-through / Sixth form.
    3. Otherwise the numeric bounds parsed from ``age_range`` are used.
    4. Rows matching nothing are labelled ``"School"``.
    """
    type_group = pl.col("type_group")
    # Phase values are compared in lower case so casing variants match.
    phase = pl.col("phase").str.to_lowercase()
    # age_range is split once on "–" into a lower and an upper bound; a bound
    # that does not parse as an integer becomes null (strict=False).
    bounds = pl.col("age_range").str.split_exact("–", 1)
    youngest = bounds.struct.field("field_0").cast(pl.Int32, strict=False)
    oldest = bounds.struct.field("field_1").cast(pl.Int32, strict=False)

    # (condition, label) pairs in priority order.
    rules = [
        (type_group == "Universities", "University"),
        (type_group == "Special schools", "Special school"),
        (type_group == "Colleges", "Further education college"),
        (phase == "nursery", "Nursery school"),
        (phase.is_in(["primary", "middle deemed primary"]), "Primary school"),
        (phase.is_in(["secondary", "middle deemed secondary"]), "Secondary school"),
        (phase == "all-through", "All-through school"),
        (phase.is_in(["16 plus", "sixth form"]), "Sixth form"),
        # Age-range fallback for rows the rules above did not classify.
        (oldest <= 5, "Nursery school"),
        (youngest >= 16, "Sixth form"),
        ((youngest <= 6) & (oldest >= 16), "All-through school"),
        (oldest <= 11, "Primary school"),
        (youngest >= 10, "Secondary school"),
    ]

    first_condition, first_label = rules[0]
    chain = pl.when(first_condition).then(pl.lit(first_label))
    for condition, label in rules[1:]:
        chain = chain.when(condition).then(pl.lit(label))
    return chain.otherwise(pl.lit("School"))
|
||||
|
||||
|
||||
OFSTED_OEIF_LABELS = {
|
||||
"1": "Outstanding",
|
||||
"2": "Good",
|
||||
"3": "Requires improvement",
|
||||
"4": "Inadequate",
|
||||
}
|
||||
|
||||
|
||||
def _load_ofsted_ratings(ofsted_path: Path) -> pl.LazyFrame:
    """Scan the Ofsted parquet into a lazy ``urn`` / ``ofsted_rating`` frame.

    Grades "1" to "4" in the "Latest OEIF overall effectiveness" column are
    translated through ``OFSTED_OEIF_LABELS``; the value "Not judged" is kept
    as-is. Rows with any other grade (including null) get no rating and are
    filtered out. ``URN`` is cast to Int64 and exposed as ``urn``.
    """
    grade = pl.col("Latest OEIF overall effectiveness")
    numeric_grades = ("1", "2", "3", "4")

    # Fold the numeric grades into one when/then chain, then append the
    # verbatim "Not judged" case; everything else becomes null.
    rating = pl.when(grade == numeric_grades[0]).then(
        pl.lit(OFSTED_OEIF_LABELS[numeric_grades[0]])
    )
    for code in numeric_grades[1:]:
        rating = rating.when(grade == code).then(pl.lit(OFSTED_OEIF_LABELS[code]))
    rating = rating.when(grade == "Not judged").then(pl.lit("Not judged")).otherwise(None)

    ratings = pl.scan_parquet(ofsted_path).select(
        pl.col("URN").cast(pl.Int64).alias("urn"),
        rating.alias("ofsted_rating"),
    )
    # NOTE(review): presumably the source has one row per URN — confirm, since
    # nothing here de-duplicates.
    return ratings.filter(pl.col("ofsted_rating").is_not_null())
|
||||
|
||||
|
||||
def transform_gias_schools(gias_path: Path, ofsted_path: Path) -> pl.LazyFrame:
|
||||
"""Convert the GIAS register parquet into POI rows with school metadata.
|
||||
Ofsted ratings are joined by URN so each school carries its latest OEIF
|
||||
overall effectiveness grade (Outstanding/Good/Requires improvement/
|
||||
Inadequate/Not judged), surfaced in the map popup."""
|
||||
icon_category_expr = _school_icon_category_expr()
|
||||
emoji_expr = icon_category_expr.replace_strict(SCHOOL_ICON_CATEGORIES)
|
||||
ofsted = _load_ofsted_ratings(ofsted_path)
|
||||
# category mirrors icon_category so the dashboard renders one toggle per
|
||||
# school type (Nursery / Primary / Secondary / Sixth form / University /…)
|
||||
# instead of bundling every GIAS row under a single "School" pill.
|
||||
return pl.scan_parquet(gias_path).join(ofsted, on="urn", how="left").select(
|
||||
pl.concat_str([pl.lit("gias-"), pl.col("urn").cast(pl.String)]).alias("id"),
|
||||
pl.col("name"),
|
||||
pl.lit("School").alias("category"),
|
||||
pl.lit("School").alias("icon_category"),
|
||||
icon_category_expr.alias("category"),
|
||||
icon_category_expr.alias("icon_category"),
|
||||
pl.lit("Education").alias("group"),
|
||||
pl.col("lat").cast(pl.Float64),
|
||||
pl.col("lng").cast(pl.Float64),
|
||||
pl.lit("🏫").alias("emoji"),
|
||||
emoji_expr.alias("emoji"),
|
||||
pl.col("phase").alias("school_phase"),
|
||||
pl.col("type").alias("school_type"),
|
||||
pl.col("type_group").alias("school_type_group"),
|
||||
|
|
@ -1346,6 +1451,7 @@ def transform_gias_schools(gias_path: Path) -> pl.LazyFrame:
|
|||
pl.col("website").alias("school_website"),
|
||||
pl.col("telephone").cast(pl.String, strict=False).alias("school_telephone"),
|
||||
pl.col("head_name").alias("school_head_name"),
|
||||
pl.col("ofsted_rating").alias("school_ofsted_rating"),
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -1355,6 +1461,7 @@ def transform(
|
|||
boundary_path: Path,
|
||||
grocery_retail_points_path: Path,
|
||||
gias_path: Path,
|
||||
ofsted_path: Path,
|
||||
) -> pl.LazyFrame:
|
||||
lf = pl.scan_parquet(input_path)
|
||||
|
||||
|
|
@ -1420,7 +1527,12 @@ def transform(
|
|||
|
||||
grocery_df = pl.read_parquet(grocery_retail_points_path)
|
||||
grocery_pois = transform_grocery_retail_points(grocery_df, boundary_path)
|
||||
frames = [lf, naptan, grocery_pois.lazy(), transform_gias_schools(gias_path)]
|
||||
frames = [
|
||||
lf,
|
||||
naptan,
|
||||
grocery_pois.lazy(),
|
||||
transform_gias_schools(gias_path, ofsted_path),
|
||||
]
|
||||
|
||||
return pl.concat(frames, how="diagonal_relaxed")
|
||||
|
||||
|
|
@ -1453,6 +1565,12 @@ def main():
|
|||
required=True,
|
||||
help="GIAS schools register parquet (replaces OSM schools)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ofsted",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Ofsted latest-inspections parquet (provides per-URN ratings)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output", type=Path, required=True, help="Output filtered POIs parquet file"
|
||||
)
|
||||
|
|
@ -1464,6 +1582,7 @@ def main():
|
|||
args.boundary,
|
||||
args.grocery_retail_points,
|
||||
args.gias,
|
||||
args.ofsted,
|
||||
).collect(engine="streaming")
|
||||
|
||||
df.write_parquet(args.output)
|
||||
|
|
|
|||
|
|
@ -219,6 +219,8 @@ def build_tree_overlay_tiles(
|
|||
str(max_zoom),
|
||||
"--drop-smallest-as-needed",
|
||||
"--extend-zooms-if-still-dropping",
|
||||
"--temporary-directory",
|
||||
tmp,
|
||||
str(ndjson_path),
|
||||
],
|
||||
check=True,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue