960 lines
31 KiB
Python
960 lines
31 KiB
Python
import argparse
|
|
import re
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import polars as pl
|
|
from thefuzz import fuzz
|
|
from tqdm import tqdm
|
|
|
|
from pipeline.local_temp import local_tmp_dir
|
|
from pipeline.transform.join_epc_pp import _scan_epc_certificates
|
|
from pipeline.utils.fuzzy_join import normalize_address_key, normalize_postcode_key
|
|
from pipeline.utils.postcode_mapping import build_postcode_mapping
|
|
|
|
# NOTE(review): MIN_FLOOR_AREA_M2 is not referenced by any code visible in this
# file; presumably a lower bound for plausible floor areas -- confirm or remove.
MIN_FLOOR_AREA_M2 = 10.0

# Listing -> historical property matching. A match is accepted only if the best
# candidate's total score reaches the threshold (a stricter one applies when the
# listing address contains no digits) and it beats the runner-up by the margin.
PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
PROPERTY_MATCH_MIN_MARGIN = 4.0

# Listing -> EPC certificate matching; same acceptance rule as above.
EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
EPC_MATCH_MIN_MARGIN = 4.0

# Written to the "Actual listing enrichment version" output column.
ENRICHMENT_VERSION = 1

# Runs of digits inside an address string.
_NUMBER_RE = re.compile(r"\d+")
|
|
|
|
# Columns the scraped listings parquet must contain; _read_listings raises a
# ValueError naming any that are absent.
LISTING_REQUIRED_COLUMNS = [
    # Rooms
    "Bedrooms",
    "Bathrooms",
    "Number of bedrooms & living rooms",
    # Location
    "lon",
    "lat",
    "Postcode",
    "Address per Property Register",
    # Property description
    "Leasehold/Freehold",
    "Property type",
    "Property sub-type",
    "Price qualifier",
    "Total floor area (sqm)",
    # Listing metadata
    "Listing URL",
    "Listing features",
    "Listing date",
    "Listing status",
    # Pricing
    "Asking price",
    "Asking price per sqm",
]
|
|
|
|
# Columns read from the historical properties parquet when present. Only a
# subset is mandatory; see the check in _load_property_candidates.
PROPERTY_CANDIDATE_COLUMNS = [
    # Identity / transaction history
    "Address per Property Register",
    "Postcode",
    "Leasehold/Freehold",
    "Last known price",
    "Date of last transaction",
    # EPC-derived attributes
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Total floor area (sqm)",
    "Number of bedrooms & living rooms",
    "Interior height (m)",
    "Construction year",
    "Former council house",
    # Other enrichment
    "Is construction date approximate",
    "Listed building",
    "Estimated monthly rent",
    "Street tree density percentile",
    "Property type",
    # Pricing
    "Price per sqm",
    "Estimated current price",
    "Est. price per sqm",
]
|
|
|
|
# NOTE(review): this list is not referenced by any code visible in this file;
# confirm whether another module imports it.
PROPERTY_ENRICHMENT_COLUMNS = [
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Interior height (m)",
    "Construction year",
    "Former council house",
    "Is construction date approximate",
    "Listed building",
    "Estimated monthly rent",
    "Street tree density percentile",
    "Date of last transaction",
]
|
|
|
|
# Columns produced for each EPC candidate and carried onto matched listings
# (under an "_epc_" prefix) by _epc_match_frame.
EPC_ENRICHMENT_COLUMNS = [
    "Address per EPC",
    "Current energy rating",
    "Potential energy rating",
    "Total floor area (sqm)",
    "Number of bedrooms & living rooms",
    "Interior height (m)",
    "Construction year",
    "Former council house",
]
|
|
|
|
# Accepted values; anything else is replaced (ratings become null, listing
# tenure / property type fall back to the matched property's value).
EPC_RATING_VALUES = list("ABCDEFG")
TENURE_VALUES = ["Freehold", "Leasehold"]
PROPERTY_TYPE_VALUES = [
    "Detached",
    "Semi-Detached",
    "Terraced",
    "Flats/Maisonettes",
    "Other",
]
|
|
|
|
# Dtypes used by _ensure_prefixed_columns when it has to create an all-null
# placeholder column; columns not listed here default to pl.Utf8 there.
COLUMN_DTYPES = {
    # Text
    "Address per EPC": pl.Utf8,
    "Current energy rating": pl.Utf8,
    "Potential energy rating": pl.Utf8,
    # Numeric
    "Total floor area (sqm)": pl.Float64,
    "Number of bedrooms & living rooms": pl.Int32,
    "Interior height (m)": pl.Float64,
    "Construction year": pl.UInt16,
    # Flags / categories
    "Former council house": pl.Utf8,
    "Is construction date approximate": pl.UInt8,
    "Listed building": pl.Utf8,
    # Derived metrics
    "Estimated monthly rent": pl.Float32,
    "Street tree density percentile": pl.Float32,
    "Date of last transaction": pl.Datetime("us"),
    "Property type": pl.Utf8,
    "Leasehold/Freehold": pl.Utf8,
}
|
|
|
|
|
|
def _canonical_postcode_expr(column: str) -> pl.Expr:
    """Build an expression that canonicalises the postcodes in ``column``.

    The value is upper-cased and stripped of every character outside A-Z/0-9.
    If the compact form matches the postcode pattern below, a single space is
    inserted before the final digit + two letters; otherwise the result is null.
    """
    squeezed = pl.col(column).cast(pl.Utf8).str.to_uppercase()
    squeezed = squeezed.str.replace_all(r"[^A-Z0-9]+", "").str.strip_chars()

    looks_valid = squeezed.str.contains(r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$")
    with_space = squeezed.str.replace(r"^(.+)([0-9][A-Z]{2})$", "${1} ${2}")
    return pl.when(looks_valid).then(with_space).otherwise(None)
|
|
|
|
|
|
def _clean_string_expr(column: str) -> pl.Expr:
    """Return ``column`` cast to text and trimmed, with empty strings as null."""
    trimmed = pl.col(column).cast(pl.Utf8).str.strip_chars()
    is_blank = trimmed == ""
    return pl.when(is_blank).then(None).otherwise(trimmed)
|
|
|
|
|
|
def _coalesce_non_empty(*columns: str) -> pl.Expr:
    """Return the first non-null, non-blank text value across ``columns``.

    Each column is cast to text; values that are empty after trimming count as
    null for the purpose of the coalesce. Surviving values are returned
    untrimmed.
    """

    def _blank_as_null(name: str) -> pl.Expr:
        text = pl.col(name).cast(pl.Utf8)
        return pl.when(text.str.strip_chars() == "").then(None).otherwise(text)

    return pl.coalesce([_blank_as_null(name) for name in columns])
|
|
|
|
|
|
def _valid_number_expr(column: str) -> pl.Expr:
    """Return ``column`` with every value that is not finite replaced by null."""
    value = pl.col(column)
    return pl.when(value.is_finite()).then(value).otherwise(None)
|
|
|
|
|
|
def _read_listings(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
    """Load the listings parquet and attach matching keys.

    Adds ``_listing_idx`` (row index), ``_original_postcode``,
    ``_listing_match_address`` and ``_listing_match_postcode``. ``Postcode`` is
    then replaced by the mapping's ``new_postcode`` where one exists, and the
    postcode match key is recomputed from the final value.

    Raises:
        ValueError: if any of LISTING_REQUIRED_COLUMNS is missing from the file.
    """
    lazy_listings = pl.scan_parquet(listings_path)
    available = set(lazy_listings.collect_schema().names())
    absent = sorted(set(LISTING_REQUIRED_COLUMNS) - available)
    if absent:
        raise ValueError(f"{listings_path} is missing listing columns: {absent}")

    key_columns = [
        _canonical_postcode_expr("Postcode").alias("_original_postcode"),
        normalize_address_key(pl.col("Address per Property Register")).alias(
            "_listing_match_address"
        ),
        normalize_postcode_key(pl.col("Postcode")).alias("_listing_match_postcode"),
    ]
    frame = (
        lazy_listings.with_row_index("_listing_idx")
        .with_columns(key_columns)
        .collect(engine="streaming")
    )

    mapping = build_postcode_mapping(arcgis_path)
    remapped = frame.join(
        mapping,
        left_on="_original_postcode",
        right_on="old_postcode",
        how="left",
    )
    # Preference order: remapped postcode, canonical original, raw value.
    final_postcode = pl.coalesce("new_postcode", "_original_postcode", "Postcode")
    remapped = remapped.with_columns(final_postcode.alias("Postcode"))
    remapped = remapped.drop("new_postcode", strict=False)

    # The key computed before the join is stale once Postcode has changed.
    return remapped.with_columns(
        normalize_postcode_key(pl.col("Postcode")).alias("_listing_match_postcode"),
    )
|
|
|
|
|
|
def _load_property_candidates(
    properties_path: Path, listing_postcodes: list[str]
) -> pl.DataFrame:
    """Load historical properties whose postcode key appears in the listings.

    Only the PROPERTY_CANDIDATE_COLUMNS present in the file are read. Rows are
    kept when at least one normalised address (register or EPC) is non-null,
    and each surviving row gets a ``_property_row`` index.

    Raises:
        ValueError: if one of the four mandatory columns is missing.
    """
    lazy_properties = pl.scan_parquet(properties_path)
    schema_names = lazy_properties.collect_schema().names()
    columns = [name for name in PROPERTY_CANDIDATE_COLUMNS if name in schema_names]

    mandatory = {
        "Address per Property Register",
        "Postcode",
        "Property type",
        "Total floor area (sqm)",
    }
    missing = sorted(mandatory - set(columns))
    if missing:
        raise ValueError(f"{properties_path} is missing property columns: {missing}")

    # The EPC address column is optional; fall back to an all-null key.
    if "Address per EPC" in columns:
        epc_key = normalize_address_key(pl.col("Address per EPC"))
    else:
        epc_key = pl.lit(None, dtype=pl.Utf8)
    register_key = normalize_address_key(pl.col("Address per Property Register"))
    postcode_key = normalize_postcode_key(pl.col("Postcode"))

    has_any_address = (
        pl.col("_match_register_address").is_not_null()
        | pl.col("_match_epc_address").is_not_null()
    )

    query = lazy_properties.select(columns)
    query = query.with_columns(postcode_key.alias("_match_postcode"))
    query = query.filter(pl.col("_match_postcode").is_in(listing_postcodes))
    query = query.with_columns(
        register_key.alias("_match_register_address"),
        epc_key.alias("_match_epc_address"),
    )
    query = query.filter(has_any_address).with_row_index("_property_row")
    return query.collect(engine="streaming")
|
|
|
|
|
|
def _property_candidates_by_postcode(
    candidates: pl.DataFrame,
) -> dict[str, list[dict]]:
    """Group candidate rows (as dicts) by their ``_match_postcode`` value.

    Rows whose postcode key is missing or empty are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_match_postcode")
        if not key:
            continue
        if key in grouped:
            grouped[key].append(record)
        else:
            grouped[key] = [record]
    return grouped
|
|
|
|
|
|
def _numbers_compatible(left: str | None, right: str | None) -> bool:
    """Tell whether the digit runs of two addresses are consistent.

    Returns False when either address is missing/empty, or when exactly one of
    them contains digits. Otherwise returns True iff the smaller set of digit
    runs is contained in the larger one (two digit-free addresses qualify).
    """
    if not (left and right):
        return False

    fewer = set(_NUMBER_RE.findall(left))
    more = set(_NUMBER_RE.findall(right))
    if len(fewer) > len(more):
        fewer, more = more, fewer

    if more and not fewer:
        return False
    return fewer <= more
|
|
|
|
|
|
def _has_number(address: str | None) -> bool:
    """Return True when ``address`` is non-empty and contains at least one digit."""
    if not address:
        return False
    return _NUMBER_RE.search(address) is not None
|
|
|
|
|
|
def _ratio_bonus(
|
|
left: float | int | None, right: float | int | None, pct: float, cap: float
|
|
) -> float:
|
|
if left is None or right is None:
|
|
return 0.0
|
|
try:
|
|
left_f = float(left)
|
|
right_f = float(right)
|
|
except (TypeError, ValueError):
|
|
return 0.0
|
|
if left_f <= 0 or right_f <= 0:
|
|
return 0.0
|
|
rel = abs(left_f - right_f) / max(left_f, right_f)
|
|
if rel > pct:
|
|
return 0.0
|
|
return cap * (1.0 - rel / pct)
|
|
|
|
|
|
def _rooms_bonus(left: int | None, right: int | None) -> float:
|
|
if left is None or right is None:
|
|
return 0.0
|
|
try:
|
|
diff = abs(int(left) - int(right))
|
|
except (TypeError, ValueError):
|
|
return 0.0
|
|
if diff == 0:
|
|
return 4.0
|
|
if diff == 1:
|
|
return 2.0
|
|
return 0.0
|
|
|
|
|
|
def _enum_bonus(
|
|
left: str | None, right: str | None, *, exact: float, mismatch: float
|
|
) -> float:
|
|
if not left or not right:
|
|
return 0.0
|
|
return exact if left == right else mismatch
|
|
|
|
|
|
def _address_score(query: str, candidate: str | None) -> int:
|
|
if not candidate:
|
|
return 0
|
|
return max(
|
|
fuzz.token_set_ratio(query, candidate),
|
|
fuzz.token_sort_ratio(query, candidate),
|
|
)
|
|
|
|
|
|
def _best_property_candidate(listing: dict, candidates: list[dict]) -> dict | None:
    """Pick the historical property that best matches ``listing``.

    Each candidate is scored as the better fuzzy address score (register vs EPC
    address) plus bonuses/penalties for property type, tenure, floor area, room
    count and price. When the listing address contains digits, a candidate is
    skipped unless at least one of its addresses is number-compatible.

    Returns a match record for the top candidate, or None when there is no
    listing address, no scorable candidate, the top score is below the
    applicable threshold, or its lead over the runner-up is below
    PROPERTY_MATCH_MIN_MARGIN. With a single scored candidate the margin is the
    score itself.
    """
    query = listing.get("_listing_match_address")
    if not query:
        return None

    check_numbers = _has_number(query)
    ranked: list[tuple[float, int, dict, str]] = []

    for row in candidates:
        register_address = row.get("_match_register_address")
        epc_address = row.get("_match_epc_address")

        if check_numbers:
            numbers_ok = _numbers_compatible(
                query, register_address
            ) or _numbers_compatible(query, epc_address)
            if not numbers_ok:
                continue

        register_score = _address_score(query, register_address)
        epc_score = _address_score(query, epc_address)
        address_score = max(register_score, epc_score)
        if address_score == 0:
            continue

        # Prefer the estimated price; fall back to the last known price.
        reference_price = row.get("Estimated current price") or row.get(
            "Last known price"
        )
        adjustments = (
            _enum_bonus(
                listing.get("Property type"),
                row.get("Property type"),
                exact=7.0,
                mismatch=-8.0,
            ),
            _enum_bonus(
                listing.get("Leasehold/Freehold"),
                row.get("Leasehold/Freehold"),
                exact=3.0,
                mismatch=-3.0,
            ),
            _ratio_bonus(
                listing.get("Total floor area (sqm)"),
                row.get("Total floor area (sqm)"),
                pct=0.15,
                cap=8.0,
            ),
            _rooms_bonus(
                listing.get("Number of bedrooms & living rooms"),
                row.get("Number of bedrooms & living rooms"),
            ),
            _ratio_bonus(
                listing.get("Asking price"),
                reference_price,
                pct=0.25,
                cap=3.0,
            ),
        )
        total = float(address_score)
        for adjustment in adjustments:
            total += adjustment

        if register_score >= epc_score:
            matched_field = "Address per Property Register"
        else:
            matched_field = "Address per EPC"
        ranked.append((total, address_score, row, matched_field))

    if not ranked:
        return None

    ranked.sort(key=lambda entry: entry[0], reverse=True)
    best_total, best_address_score, best_row, best_field = ranked[0]
    if len(ranked) > 1:
        margin = best_total - ranked[1][0]
    else:
        margin = best_total

    if check_numbers:
        threshold = PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
    else:
        threshold = PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    if best_total < threshold or margin < PROPERTY_MATCH_MIN_MARGIN:
        return None

    return {
        "_listing_idx": listing["_listing_idx"],
        "_property_row": best_row["_property_row"],
        "Historical property match score": round(best_total, 1),
        "Historical property address score": best_address_score,
        "Historical property match margin": round(margin, 1),
        "Historical property match field": best_field,
        "Historical property match status": "matched",
    }
|
|
|
|
|
|
def _match_properties(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Match every listing against the property candidates in its postcode.

    Returns one row per accepted match. When there are no candidates or no
    accepted matches, an empty frame with the same schema is returned.
    """
    schema = {
        "_listing_idx": pl.UInt32,
        "_property_row": pl.UInt32,
        "Historical property match score": pl.Float32,
        "Historical property address score": pl.Int32,
        "Historical property match margin": pl.Float32,
        "Historical property match field": pl.Utf8,
        "Historical property match status": pl.Utf8,
    }
    if candidates.is_empty():
        return pl.DataFrame(schema=schema)

    by_postcode = _property_candidates_by_postcode(candidates)
    progress = tqdm(
        listings.iter_rows(named=True),
        total=listings.height,
        desc="Matching historical properties",
    )

    accepted = []
    for listing in progress:
        postcode = listing.get("_listing_match_postcode")
        if postcode:
            found = _best_property_candidate(listing, by_postcode.get(postcode, []))
            if found is not None:
                accepted.append(found)

    if accepted:
        return pl.DataFrame(accepted, schema=schema)
    return pl.DataFrame(schema=schema)
|
|
|
|
|
|
def _prefix_columns(df: pl.DataFrame, columns: list[str], prefix: str) -> pl.DataFrame:
    """Rename each of ``columns`` that exists in ``df`` to ``prefix + name``."""
    renames = {}
    for name in columns:
        if name in df.columns:
            renames[name] = f"{prefix}{name}"
    return df.rename(renames)
|
|
|
|
|
|
def _ensure_prefixed_columns(
    df: pl.DataFrame, columns: list[str], prefix: str
) -> pl.DataFrame:
    """Add an all-null ``prefix + name`` column for each one missing from ``df``.

    The placeholder dtype comes from COLUMN_DTYPES, defaulting to pl.Utf8.
    ``df`` is returned unchanged when nothing is missing.
    """
    placeholders = []
    for column in columns:
        target = f"{prefix}{column}"
        if target in df.columns:
            continue
        dtype = COLUMN_DTYPES.get(column, pl.Utf8)
        placeholders.append(pl.lit(None, dtype=dtype).alias(target))

    return df.with_columns(placeholders) if placeholders else df
|
|
|
|
|
|
def _property_match_frame(
    matches: pl.DataFrame, candidates: pl.DataFrame
) -> pl.DataFrame:
    """Attach the matched candidates' attributes to ``matches``.

    Candidate columns are joined on ``_property_row`` and renamed with a
    ``_property_`` prefix. An empty ``matches`` frame is returned as-is.
    """
    if matches.is_empty():
        return matches

    carried = [name for name in PROPERTY_CANDIDATE_COLUMNS if name in candidates.columns]
    lookup = candidates.select(["_property_row", *carried])
    joined = matches.join(lookup, on="_property_row", how="left")

    to_prefix = [name for name in PROPERTY_CANDIDATE_COLUMNS if name in joined.columns]
    return _prefix_columns(joined, to_prefix, "_property_")
|
|
|
|
|
|
def _canonical_epc_property_type_expr() -> pl.Expr:
    """Derive a canonical property type from EPC type and built form.

    For houses with a usable ``built_form`` the built form is used; otherwise
    the EPC property type itself (null when that is null). The chosen value is
    then mapped through the replacement table below; values not in the table
    pass through unchanged.
    """
    built_form = pl.col("built_form")
    epc_type = pl.col("epc_property_type")

    built_form_unusable = built_form.is_null() | built_form.is_in(
        ["NO DATA!", "Not Recorded"]
    )
    epc_type_known = epc_type.is_not_null()
    house_with_form = epc_type_known & (epc_type == "House") & ~built_form_unusable

    raw_type = (
        pl.when(house_with_form)
        .then(built_form)
        .when(epc_type_known)
        .then(epc_type)
        .otherwise(None)
    )

    canonical_names = {
        "Flat": "Flats/Maisonettes",
        "Maisonette": "Flats/Maisonettes",
        "End-Terrace": "Terraced",
        "Mid-Terrace": "Terraced",
        "Enclosed End-Terrace": "Terraced",
        "Enclosed Mid-Terrace": "Terraced",
        "Bungalow": "Other",
        "Park home": "Other",
        "House": "Other",
    }
    return raw_type.replace(canonical_names)
|
|
|
|
|
|
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    """Extract the first four-digit number from ``column`` as a UInt16 year.

    The "England and Wales: " prefix and " onwards" suffix are removed first;
    values without a four-digit run (or that fail the cast) become null.
    """
    band = pl.col(column).cast(pl.Utf8)
    band = band.str.replace("England and Wales: ", "")
    band = band.str.replace(" onwards", "")
    first_year = band.str.extract(r"(\d{4})", 1)
    return first_year.cast(pl.UInt16, strict=False)
|
|
|
|
|
|
def _fractional_year_expr(column: str) -> pl.Expr:
    """Convert a temporal column to ``year + (month - 1) / 12`` as Float32."""
    moment = pl.col(column)
    whole_year = moment.dt.year().cast(pl.Float32)
    elapsed_months = moment.dt.month().cast(pl.Float32) - 1.0
    return whole_year + elapsed_months / 12.0
|
|
|
|
|
|
def _load_epc_candidates(
    epc_path: Path, listing_postcodes: list[str], temp_dir: Path
) -> pl.DataFrame:
    """Load one EPC candidate per (address key, postcode key) for the listings.

    Certificates are restricted to the listings' postcode keys, and the one with
    the most recent ``inspection_date`` is kept for each address/postcode pair.
    "Former council house" is "Yes" when any certificate for that pair has a
    tenure containing "social" (case-insensitive), otherwise "No". Rows without
    an address key are dropped and the rest get an ``_epc_row`` index.

    Args:
        epc_path: EPC certificates source handed to ``_scan_epc_certificates``.
        listing_postcodes: Normalised postcode keys present in the listings.
        temp_dir: Scratch directory handed to ``_scan_epc_certificates``.
    """
    epc_base = _scan_epc_certificates(epc_path, temp_dir).with_columns(
        normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
        normalize_postcode_key(pl.col("epc_postcode")).alias("_epc_match_postcode"),
    )
    # Shared by both branches below.
    in_scope = epc_base.filter(pl.col("_epc_match_postcode").is_in(listing_postcodes))

    epc = (
        # nulls_last=True: by default polars sorts nulls first even when
        # descending, which would let an undated certificate win over dated ones.
        in_scope.sort("inspection_date", descending=True, nulls_last=True)
        .group_by("_epc_match_address", "_epc_match_postcode")
        .first()
        .with_columns(
            _canonical_epc_property_type_expr().alias("_epc_canonical_property_type"),
            _construction_year_expr().alias("Construction year"),
            # Ratings outside A-G are treated as missing.
            pl.when(pl.col("current_energy_rating").is_in(EPC_RATING_VALUES))
            .then(pl.col("current_energy_rating"))
            .otherwise(None)
            .alias("Current energy rating"),
            pl.when(pl.col("potential_energy_rating").is_in(EPC_RATING_VALUES))
            .then(pl.col("potential_energy_rating"))
            .otherwise(None)
            .alias("Potential energy rating"),
            pl.col("total_floor_area").alias("Total floor area (sqm)"),
            pl.col("number_habitable_rooms").alias("Number of bedrooms & living rooms"),
            pl.col("floor_height").alias("Interior height (m)"),
            pl.col("epc_address").alias("Address per EPC"),
        )
        .drop("tenure", strict=False)
    )

    # Looks at every in-scope certificate, not only the latest one per address.
    social_tenure = (
        in_scope.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
        .select("_epc_match_address", "_epc_match_postcode")
        .unique()
        .with_columns(pl.lit("Yes").alias("Former council house"))
    )

    return (
        epc.join(
            social_tenure,
            on=["_epc_match_address", "_epc_match_postcode"],
            how="left",
        )
        .with_columns(pl.col("Former council house").fill_null("No"))
        .filter(pl.col("_epc_match_address").is_not_null())
        .with_row_index("_epc_row")
        .select(
            "_epc_row",
            "_epc_match_address",
            "_epc_match_postcode",
            "_epc_canonical_property_type",
            *EPC_ENRICHMENT_COLUMNS,
        )
        .collect(engine="streaming")
    )
|
|
|
|
|
|
def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
    """Group EPC candidate rows (as dicts) by their ``_epc_match_postcode``.

    Rows whose postcode key is missing or empty are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_epc_match_postcode")
        if not key:
            continue
        if key in grouped:
            grouped[key].append(record)
        else:
            grouped[key] = [record]
    return grouped
|
|
|
|
|
|
def _best_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
    """Pick the EPC certificate that best matches ``listing``.

    Each candidate is scored as its fuzzy address score plus bonuses/penalties
    for property type, floor area and room count. When the listing address
    contains digits, candidates with incompatible numbers are skipped.

    Returns a match record for the top candidate, or None when there is no
    listing address, no scorable candidate, the top score is below the
    applicable threshold, or its lead over the runner-up is below
    EPC_MATCH_MIN_MARGIN. With a single scored candidate the margin is the
    score itself.
    """
    query = listing.get("_listing_match_address")
    if not query:
        return None

    check_numbers = _has_number(query)
    ranked: list[tuple[float, int, dict]] = []

    for row in candidates:
        address = row.get("_epc_match_address")
        if check_numbers and not _numbers_compatible(query, address):
            continue

        address_score = _address_score(query, address)
        if address_score == 0:
            continue

        adjustments = (
            _enum_bonus(
                listing.get("Property type"),
                row.get("_epc_canonical_property_type"),
                exact=6.0,
                mismatch=-6.0,
            ),
            _ratio_bonus(
                listing.get("Total floor area (sqm)"),
                row.get("Total floor area (sqm)"),
                pct=0.12,
                cap=8.0,
            ),
            _rooms_bonus(
                listing.get("Number of bedrooms & living rooms"),
                row.get("Number of bedrooms & living rooms"),
            ),
        )
        total = float(address_score)
        for adjustment in adjustments:
            total += adjustment
        ranked.append((total, address_score, row))

    if not ranked:
        return None

    ranked.sort(key=lambda entry: entry[0], reverse=True)
    best_total, best_address_score, best_row = ranked[0]
    if len(ranked) > 1:
        margin = best_total - ranked[1][0]
    else:
        margin = best_total

    if check_numbers:
        threshold = EPC_MATCH_MIN_SCORE_WITH_NUMBERS
    else:
        threshold = EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    if best_total < threshold or margin < EPC_MATCH_MIN_MARGIN:
        return None

    return {
        "_listing_idx": listing["_listing_idx"],
        "_epc_row": best_row["_epc_row"],
        "EPC match score": round(best_total, 1),
        "EPC address score": best_address_score,
        "EPC match margin": round(margin, 1),
        "EPC match status": "matched",
    }
|
|
|
|
|
|
def _match_epc(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Match every listing against the EPC candidates in its postcode.

    Returns one row per accepted match. When there are no candidates or no
    accepted matches, an empty frame with the same schema is returned.
    """
    schema = {
        "_listing_idx": pl.UInt32,
        "_epc_row": pl.UInt32,
        "EPC match score": pl.Float32,
        "EPC address score": pl.Int32,
        "EPC match margin": pl.Float32,
        "EPC match status": pl.Utf8,
    }
    if candidates.is_empty():
        return pl.DataFrame(schema=schema)

    by_postcode = _epc_candidates_by_postcode(candidates)
    progress = tqdm(
        listings.iter_rows(named=True),
        total=listings.height,
        desc="Matching EPC certificates",
    )

    accepted = []
    for listing in progress:
        postcode = listing.get("_listing_match_postcode")
        if postcode:
            found = _best_epc_candidate(listing, by_postcode.get(postcode, []))
            if found is not None:
                accepted.append(found)

    if accepted:
        return pl.DataFrame(accepted, schema=schema)
    return pl.DataFrame(schema=schema)
|
|
|
|
|
|
def _epc_match_frame(matches: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Attach the matched certificates' attributes to ``matches``.

    EPC_ENRICHMENT_COLUMNS are joined on ``_epc_row`` and renamed with an
    ``_epc_`` prefix. An empty ``matches`` frame is returned as-is.
    """
    if matches.is_empty():
        return matches

    lookup = candidates.select("_epc_row", *EPC_ENRICHMENT_COLUMNS)
    joined = matches.join(lookup, on="_epc_row", how="left")

    to_prefix = [name for name in EPC_ENRICHMENT_COLUMNS if name in joined.columns]
    return _prefix_columns(joined, to_prefix, "_epc_")
|
|
|
|
|
|
def _join_postcode_features(
    listings: pl.DataFrame, postcode_features_path: Path
) -> pl.DataFrame:
    """Left-join the postcode feature parquet onto ``listings`` by ``Postcode``.

    Feature columns whose names clash with listing columns get a ``_postcode``
    suffix.
    """
    features = pl.scan_parquet(postcode_features_path).collect(engine="streaming")
    return listings.join(features, on="Postcode", how="left", suffix="_postcode")
|
|
|
|
|
|
def _coalesce_feature_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Merge listing, ``_epc_`` and ``_property_`` values into final columns.

    The listing's own value wins where it is valid; otherwise EPC values are
    preferred over historical-property values. Price columns are derived from
    the asking price. All first-stage expressions are evaluated in a single
    ``with_columns`` call, so each one reads the input columns rather than
    another expression's output.
    """

    def _epc_then_property_text(field: str) -> pl.Expr:
        # First non-blank text value: EPC match, then property match.
        return _coalesce_non_empty(f"_epc_{field}", f"_property_{field}")

    def _from_property(field: str) -> pl.Expr:
        return pl.col(f"_property_{field}")

    rooms = pl.col("Number of bedrooms & living rooms")
    asking_price = pl.col("Asking price")

    version = pl.lit(ENRICHMENT_VERSION, dtype=pl.UInt16).alias(
        "Actual listing enrichment version"
    )
    property_type = (
        pl.when(pl.col("Property type").is_in(PROPERTY_TYPE_VALUES))
        .then(pl.col("Property type"))
        .otherwise(_from_property("Property type"))
        .alias("Property type")
    )
    tenure = (
        pl.when(pl.col("Leasehold/Freehold").is_in(TENURE_VALUES))
        .then(pl.col("Leasehold/Freehold"))
        .otherwise(_from_property("Leasehold/Freehold"))
        .alias("Leasehold/Freehold")
    )
    floor_area = pl.coalesce(
        _valid_number_expr("Total floor area (sqm)"),
        _valid_number_expr("_epc_Total floor area (sqm)"),
        _valid_number_expr("_property_Total floor area (sqm)"),
    ).alias("Total floor area (sqm)")
    fallback_rooms = pl.coalesce(
        pl.col("_epc_Number of bedrooms & living rooms"),
        pl.col("_property_Number of bedrooms & living rooms"),
    )
    room_count = (
        pl.when(rooms > 0)
        .then(rooms)
        .otherwise(fallback_rooms)
        .cast(pl.Int32, strict=False)
        .alias("Number of bedrooms & living rooms")
    )
    interior_height = pl.coalesce(
        _valid_number_expr("_epc_Interior height (m)"),
        _valid_number_expr("_property_Interior height (m)"),
    ).alias("Interior height (m)")
    construction_year = (
        pl.coalesce(
            pl.col("_epc_Construction year"),
            pl.col("_property_Construction year"),
        )
        .cast(pl.UInt16, strict=False)
        .alias("Construction year")
    )

    merged_columns: list[pl.Expr] = [
        version,
        _epc_then_property_text("Address per EPC").alias("Address per EPC"),
        property_type,
        tenure,
        floor_area,
        room_count,
        # Both price columns carry the asking price.
        asking_price.alias("Estimated current price"),
        asking_price.alias("Last known price"),
        _epc_then_property_text("Current energy rating").alias(
            "Current energy rating"
        ),
        _epc_then_property_text("Potential energy rating").alias(
            "Potential energy rating"
        ),
        interior_height,
        construction_year,
        _epc_then_property_text("Former council house")
        .fill_null("No")
        .alias("Former council house"),
        _from_property("Is construction date approximate").alias(
            "Is construction date approximate"
        ),
        _from_property("Listed building").fill_null("No").alias("Listed building"),
        _from_property("Estimated monthly rent").alias("Estimated monthly rent"),
        _from_property("Street tree density percentile").alias(
            "Street tree density percentile"
        ),
        _fractional_year_expr("_property_Date of last transaction").alias(
            "Date of last transaction"
        ),
    ]
    df = df.with_columns(merged_columns)

    # Second stage: uses the floor area resolved above.
    final_area = pl.col("Total floor area (sqm)")
    can_divide = (
        asking_price.is_not_null() & final_area.is_not_null() & (final_area > 0)
    )
    price_per_sqm = (
        pl.when(can_divide)
        .then((asking_price / final_area).round(0))
        .otherwise(None)
        .cast(pl.Int32, strict=False)
    )
    df = df.with_columns(price_per_sqm.alias("Asking price per sqm"))

    return df.with_columns(
        pl.col("Asking price per sqm").alias("Est. price per sqm"),
        pl.col("Asking price per sqm").alias("Price per sqm"),
    )
|
|
|
|
|
|
def _drop_internal_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Remove helper columns that must not reach the output file.

    Drops every column starting with ``_property_`` or ``_epc_`` plus the
    explicitly named bookkeeping columns below.
    """
    helper_prefixes = ("_property_", "_epc_")
    helper_names = {
        "_listing_idx",
        "_listing_match_address",
        "_listing_match_postcode",
        "_original_postcode",
        "_property_row",
        "_epc_row",
        "lat_postcode",
        "lon_postcode",
    }

    def _is_internal(name: str) -> bool:
        return name in helper_names or name.startswith(helper_prefixes)

    doomed = [name for name in df.columns if _is_internal(name)]
    return df.drop(doomed, strict=False)
|
|
|
|
|
|
def build_enriched_actual_listings(
    listings_path: Path,
    properties_path: Path,
    postcode_features_path: Path,
    arcgis_path: Path,
    output_path: Path,
    *,
    epc_path: Path | None = None,
) -> pl.DataFrame:
    """Enrich scraped listings and write them to ``output_path`` as parquet.

    Steps: load listings, fuzzy-match them to historical properties and
    (when ``epc_path`` is given) to EPC certificates, join postcode features,
    merge the sources into final columns, drop helper columns, write the file.

    The match metadata columns (score, margin, status, ...) are always present
    in the output, filled with nulls when nothing matched or EPC matching was
    skipped, so the output schema does not depend on the data.

    Args:
        listings_path: Scraped listings parquet.
        properties_path: Historical properties parquet.
        postcode_features_path: Postcode feature parquet.
        arcgis_path: Postcode parquet passed to the postcode remapping step.
        output_path: Destination parquet; parent directories are created.
        epc_path: EPC certificates source, or None to skip EPC matching.

    Returns:
        The enriched frame that was written.
    """
    print(f"Loading listings from {listings_path}...")
    listings = _read_listings(listings_path, arcgis_path)
    listing_postcodes = (
        listings.select("_listing_match_postcode")
        .drop_nulls()
        .unique()
        .to_series()
        .to_list()
    )
    print(f"Listings: {listings.height}; unique postcodes: {len(listing_postcodes)}")

    print(f"Loading property candidates from {properties_path}...")
    property_candidates = _load_property_candidates(properties_path, listing_postcodes)
    print(f"Property candidates: {property_candidates.height}")
    property_matches = _match_properties(listings, property_candidates)
    print(f"Historical property matches: {property_matches.height}")
    property_match_frame = _property_match_frame(property_matches, property_candidates)

    enriched = _join_postcode_features(listings, postcode_features_path)
    # Join even when the frame is empty: a left join against the schema-only
    # frame adds every match column as nulls, keeping the output schema stable.
    enriched = enriched.join(property_match_frame, on="_listing_idx", how="left")

    if epc_path is not None:
        with tempfile.TemporaryDirectory(
            prefix="actual_listing_epc_", dir=local_tmp_dir()
        ) as tmpdir:
            print(f"Loading EPC candidates from {epc_path}...")
            epc_candidates = _load_epc_candidates(
                epc_path, listing_postcodes, Path(tmpdir)
            )
            print(f"EPC candidates: {epc_candidates.height}")
            epc_matches = _match_epc(listings, epc_candidates)
            print(f"EPC matches: {epc_matches.height}")
            epc_match_frame = _epc_match_frame(epc_matches, epc_candidates)
    else:
        # With no candidates _match_epc returns its empty, fully typed frame.
        epc_match_frame = _match_epc(listings, pl.DataFrame())
    enriched = enriched.join(epc_match_frame, on="_listing_idx", how="left")

    # Guarantee every prefixed source column exists before merging.
    enriched = _ensure_prefixed_columns(
        enriched, PROPERTY_CANDIDATE_COLUMNS, "_property_"
    )
    enriched = _ensure_prefixed_columns(enriched, EPC_ENRICHMENT_COLUMNS, "_epc_")
    enriched = _coalesce_feature_columns(enriched)
    enriched = _drop_internal_columns(enriched)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    enriched.write_parquet(output_path)
    size_mb = output_path.stat().st_size / (1024 * 1024)
    print(
        f"Wrote {enriched.height} enriched listings to {output_path} ({size_mb:.1f} MB)"
    )
    return enriched
|
|
|
|
|
|
def main() -> None:
    """Parse command-line options and build the enriched listings parquet.

    EPC matching is skipped when ``--no-epc`` is given or when the ``--epc``
    path does not exist (a notice is printed in the latter case).
    """
    parser = argparse.ArgumentParser(
        description="Build a pre-enriched actual-listings parquet for the server"
    )

    # (flag, add_argument keyword arguments), in help-output order.
    options = [
        (
            "--listings",
            {
                "type": Path,
                "default": Path("finder/data/online_listings_buy.parquet"),
                "help": "Input scraped listings parquet",
            },
        ),
        (
            "--properties",
            {
                "type": Path,
                "default": Path("property-data/properties.parquet"),
                "help": "Historical properties parquet",
            },
        ),
        (
            "--postcode-features",
            {
                "type": Path,
                "default": Path("property-data/postcode.parquet"),
                "help": "Postcode feature parquet",
            },
        ),
        (
            "--arcgis",
            {
                "type": Path,
                "default": Path("property-data/arcgis_data.parquet"),
                "help": "ArcGIS/NSPL postcode parquet used for terminated-postcode remapping",
            },
        ),
        (
            "--epc",
            {
                "type": Path,
                "default": Path("manual-data/domestic-csv.zip"),
                "help": "Optional EPC certificates CSV/zip for direct listing-to-EPC fuzzy matching",
            },
        ),
        (
            "--no-epc",
            {
                "action": "store_true",
                "help": "Skip direct EPC matching even when --epc exists",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "default": Path("finder/data/online_listings_buy_enriched.parquet"),
                "help": "Output enriched listings parquet",
            },
        ),
    ]
    for flag, spec in options:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    if args.no_epc:
        epc_path = None
    else:
        epc_path = args.epc
    if epc_path is not None and not epc_path.exists():
        print(
            f"EPC source not found at {epc_path}; continuing without direct EPC matching"
        )
        epc_path = None

    build_enriched_actual_listings(
        listings_path=args.listings,
        properties_path=args.properties,
        postcode_features_path=args.postcode_features,
        arcgis_path=args.arcgis,
        epc_path=epc_path,
        output_path=args.output,
    )


if __name__ == "__main__":
    main()
|