perfect-postcode/pipeline/transform/enrich_actual_listings.py
2026-05-26 19:45:13 +01:00

960 lines
31 KiB
Python

import argparse
import math
import re
import tempfile
from pathlib import Path

import polars as pl
from thefuzz import fuzz
from tqdm import tqdm

from pipeline.local_temp import local_tmp_dir
from pipeline.transform.join_epc_pp import _scan_epc_certificates
from pipeline.utils.fuzzy_join import normalize_address_key, normalize_postcode_key
from pipeline.utils.postcode_mapping import build_postcode_mapping
# NOTE(review): MIN_FLOOR_AREA_M2 is not referenced in the visible code;
# presumably a lower bound on plausible floor area — confirm or remove.
MIN_FLOOR_AREA_M2 = 10.0
# Acceptance thresholds for listing -> historical-property matching.
# The total score (address similarity plus attribute bonuses) must reach the
# "WITH_NUMBERS" value when the listing address contains digits, and the
# stricter "WITHOUT_NUMBERS" value otherwise.
PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
# The best candidate must beat the runner-up by at least this many points.
PROPERTY_MATCH_MIN_MARGIN = 4.0
# Same three thresholds, applied to listing -> EPC certificate matching.
EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
EPC_MATCH_MIN_MARGIN = 4.0
# Written to the "Actual listing enrichment version" output column.
ENRICHMENT_VERSION = 1
# Runs of digits; used to compare the numbers appearing in two addresses.
_NUMBER_RE = re.compile(r"\d+")
# Columns the listings parquet must contain; _read_listings raises ValueError
# when any of them is absent.
LISTING_REQUIRED_COLUMNS = [
"Bedrooms",
"Bathrooms",
"Number of bedrooms & living rooms",
"lon",
"lat",
"Postcode",
"Address per Property Register",
"Leasehold/Freehold",
"Property type",
"Property sub-type",
"Price qualifier",
"Total floor area (sqm)",
"Listing URL",
"Listing features",
"Listing date",
"Listing status",
"Asking price",
"Asking price per sqm",
]
# Columns read from the historical properties parquet when present there.
# After matching they are carried onto the listing under a "_property_" prefix.
PROPERTY_CANDIDATE_COLUMNS = [
"Address per Property Register",
"Postcode",
"Leasehold/Freehold",
"Last known price",
"Date of last transaction",
"Address per EPC",
"Current energy rating",
"Potential energy rating",
"Total floor area (sqm)",
"Number of bedrooms & living rooms",
"Interior height (m)",
"Construction year",
"Former council house",
"Is construction date approximate",
"Listed building",
"Estimated monthly rent",
"Street tree density percentile",
"Property type",
"Price per sqm",
"Estimated current price",
"Est. price per sqm",
]
# NOTE(review): PROPERTY_ENRICHMENT_COLUMNS is not referenced in the visible
# code — confirm whether it is still needed.
PROPERTY_ENRICHMENT_COLUMNS = [
"Address per EPC",
"Current energy rating",
"Potential energy rating",
"Interior height (m)",
"Construction year",
"Former council house",
"Is construction date approximate",
"Listed building",
"Estimated monthly rent",
"Street tree density percentile",
"Date of last transaction",
]
# Columns produced for each EPC candidate and carried onto the listing under
# an "_epc_" prefix after matching.
EPC_ENRICHMENT_COLUMNS = [
"Address per EPC",
"Current energy rating",
"Potential energy rating",
"Total floor area (sqm)",
"Number of bedrooms & living rooms",
"Interior height (m)",
"Construction year",
"Former council house",
]
# Energy ratings kept as-is when loading EPC candidates; anything else
# becomes null.
EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
# Listing tenure values that are kept; other values fall back to the matched
# property's tenure.
TENURE_VALUES = ["Freehold", "Leasehold"]
# Listing property types that are kept; other values fall back to the matched
# property's type.
PROPERTY_TYPE_VALUES = [
"Detached",
"Semi-Detached",
"Terraced",
"Flats/Maisonettes",
"Other",
]
# Dtypes used when a prefixed enrichment column has to be created as all-null
# (see _ensure_prefixed_columns); columns not listed here default to Utf8.
COLUMN_DTYPES = {
"Address per EPC": pl.Utf8,
"Current energy rating": pl.Utf8,
"Potential energy rating": pl.Utf8,
"Total floor area (sqm)": pl.Float64,
"Number of bedrooms & living rooms": pl.Int32,
"Interior height (m)": pl.Float64,
"Construction year": pl.UInt16,
"Former council house": pl.Utf8,
"Is construction date approximate": pl.UInt8,
"Listed building": pl.Utf8,
"Estimated monthly rent": pl.Float32,
"Street tree density percentile": pl.Float32,
"Date of last transaction": pl.Datetime("us"),
"Property type": pl.Utf8,
"Leasehold/Freehold": pl.Utf8,
}
def _canonical_postcode_expr(column: str) -> pl.Expr:
    """Build an expression that rewrites ``column`` as ``OUTWARD INWARD``.

    The value is upper-cased and every character outside A-Z / 0-9 is removed.
    When the remainder has the shape of a full postcode, one space is inserted
    before its last three characters; any other value becomes null.
    """
    squashed = pl.col(column).cast(pl.Utf8).str.to_uppercase()
    squashed = squashed.str.replace_all(r"[^A-Z0-9]+", "").str.strip_chars()

    looks_like_postcode = squashed.str.contains(r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$")
    with_space = squashed.str.replace(r"^(.+)([0-9][A-Z]{2})$", "${1} ${2}")

    return pl.when(looks_like_postcode).then(with_space).otherwise(None)
def _clean_string_expr(column: str) -> pl.Expr:
    """Return ``column`` as whitespace-trimmed text, mapping "" to null."""
    trimmed = pl.col(column).cast(pl.Utf8).str.strip_chars()
    is_blank = trimmed == ""
    return pl.when(is_blank).then(None).otherwise(trimmed)
def _coalesce_non_empty(*columns: str) -> pl.Expr:
    """Return the first of ``columns`` that is neither null nor blank.

    A value counts as blank when it is empty after trimming whitespace. The
    value that is returned is the text as stored (not trimmed).
    """

    def _blank_to_null(name: str) -> pl.Expr:
        text = pl.col(name).cast(pl.Utf8)
        return pl.when(text.str.strip_chars() == "").then(None).otherwise(text)

    return pl.coalesce([_blank_to_null(name) for name in columns])
def _valid_number_expr(column: str) -> pl.Expr:
    """Return ``column`` where it is finite, null everywhere else."""
    value = pl.col(column)
    return pl.when(value.is_finite()).then(value).otherwise(None)
def _read_listings(listings_path: Path, arcgis_path: Path) -> pl.DataFrame:
    """Load scraped listings and attach the keys used for matching.

    Adds ``_listing_idx`` (row index), ``_original_postcode`` (canonical form
    of the scraped postcode), ``_listing_match_address`` and
    ``_listing_match_postcode``. ``Postcode`` is replaced by the remapped
    postcode from ``build_postcode_mapping`` when one exists, otherwise by the
    canonical postcode, otherwise it keeps its scraped value.

    Args:
        listings_path: Parquet file with the scraped listings.
        arcgis_path: Input handed to ``build_postcode_mapping``.

    Returns:
        The listings with the helper columns described above.

    Raises:
        ValueError: If any column in ``LISTING_REQUIRED_COLUMNS`` is missing.
    """
    scan = pl.scan_parquet(listings_path)
    missing = sorted(
        set(LISTING_REQUIRED_COLUMNS) - set(scan.collect_schema().names())
    )
    if missing:
        raise ValueError(f"{listings_path} is missing listing columns: {missing}")

    # The postcode match key is deliberately NOT computed here: it has to be
    # derived from the remapped postcode, which only exists after the join.
    listings = (
        scan.with_row_index("_listing_idx")
        .with_columns(
            _canonical_postcode_expr("Postcode").alias("_original_postcode"),
            normalize_address_key(pl.col("Address per Property Register")).alias(
                "_listing_match_address"
            ),
        )
        .collect(engine="streaming")
    )

    postcode_mapping = build_postcode_mapping(arcgis_path)
    return (
        listings.join(
            postcode_mapping,
            left_on="_original_postcode",
            right_on="old_postcode",
            how="left",
        )
        .with_columns(
            pl.coalesce("new_postcode", "_original_postcode", "Postcode").alias(
                "Postcode"
            ),
        )
        .drop("new_postcode", strict=False)
        .with_columns(
            normalize_postcode_key(pl.col("Postcode")).alias("_listing_match_postcode"),
        )
    )
def _load_property_candidates(
    properties_path: Path, listing_postcodes: list[str]
) -> pl.DataFrame:
    """Load historical properties located in the listings' postcodes.

    Only the ``PROPERTY_CANDIDATE_COLUMNS`` present in the file are read. Rows
    are kept when their normalised postcode is in ``listing_postcodes`` and at
    least one of their two normalised addresses is non-null. Each surviving
    row gets a ``_property_row`` index.

    Raises:
        ValueError: If the address, postcode, property type or floor-area
            column is missing from the file.
    """
    scan = pl.scan_parquet(properties_path)
    available = scan.collect_schema().names()
    columns = [name for name in PROPERTY_CANDIDATE_COLUMNS if name in available]

    required = {
        "Address per Property Register",
        "Postcode",
        "Property type",
        "Total floor area (sqm)",
    }
    missing = sorted(required - set(columns))
    if missing:
        raise ValueError(f"{properties_path} is missing property columns: {missing}")

    register_key = normalize_address_key(
        pl.col("Address per Property Register")
    ).alias("_match_register_address")
    # The EPC address is optional in the source file; use a null key without it.
    if "Address per EPC" in columns:
        epc_key = normalize_address_key(pl.col("Address per EPC")).alias(
            "_match_epc_address"
        )
    else:
        epc_key = pl.lit(None, dtype=pl.Utf8).alias("_match_epc_address")

    has_some_address = (
        pl.col("_match_register_address").is_not_null()
        | pl.col("_match_epc_address").is_not_null()
    )

    return (
        scan.select(columns)
        .with_columns(
            normalize_postcode_key(pl.col("Postcode")).alias("_match_postcode")
        )
        .filter(pl.col("_match_postcode").is_in(listing_postcodes))
        .with_columns(register_key, epc_key)
        .filter(has_some_address)
        .with_row_index("_property_row")
        .collect(engine="streaming")
    )
def _property_candidates_by_postcode(
    candidates: pl.DataFrame,
) -> dict[str, list[dict]]:
    """Group candidate rows (as dicts) by their ``_match_postcode`` value.

    Rows whose postcode key is null or empty are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_match_postcode")
        if not key:
            continue
        if key in grouped:
            grouped[key].append(record)
        else:
            grouped[key] = [record]
    return grouped
def _numbers_compatible(left: str | None, right: str | None) -> bool:
    """Tell whether the digit runs of two addresses are consistent.

    The addresses are compatible when the smaller set of distinct numbers is
    contained in the larger one. A missing address is never compatible, and
    neither is a pair where only one side contains numbers.
    """
    if not (left and right):
        return False

    fewer = set(_NUMBER_RE.findall(left))
    more = set(_NUMBER_RE.findall(right))
    if len(fewer) > len(more):
        fewer, more = more, fewer

    # One address has numbers and the other has none: treat as a mismatch.
    if more and not fewer:
        return False
    return fewer <= more
def _has_number(address: str | None) -> bool:
    """Return True when ``address`` is non-empty and contains a digit."""
    if not address:
        return False
    return _NUMBER_RE.search(address) is not None
def _ratio_bonus(
left: float | int | None, right: float | int | None, pct: float, cap: float
) -> float:
if left is None or right is None:
return 0.0
try:
left_f = float(left)
right_f = float(right)
except (TypeError, ValueError):
return 0.0
if left_f <= 0 or right_f <= 0:
return 0.0
rel = abs(left_f - right_f) / max(left_f, right_f)
if rel > pct:
return 0.0
return cap * (1.0 - rel / pct)
def _rooms_bonus(left: int | None, right: int | None) -> float:
if left is None or right is None:
return 0.0
try:
diff = abs(int(left) - int(right))
except (TypeError, ValueError):
return 0.0
if diff == 0:
return 4.0
if diff == 1:
return 2.0
return 0.0
def _enum_bonus(
left: str | None, right: str | None, *, exact: float, mismatch: float
) -> float:
if not left or not right:
return 0.0
return exact if left == right else mismatch
def _address_score(query: str, candidate: str | None) -> int:
if not candidate:
return 0
return max(
fuzz.token_set_ratio(query, candidate),
fuzz.token_sort_ratio(query, candidate),
)
def _best_property_candidate(listing: dict, candidates: list[dict]) -> dict | None:
    """Pick the historical property that best matches one listing.

    Every candidate is scored on fuzzy address similarity (the better of its
    register and EPC address), adjusted by bonuses or penalties for property
    type, tenure, floor area, room count and price. When the listing address
    contains numbers, candidates whose numbers are incompatible on both
    addresses are skipped.

    Returns:
        A match record for the top candidate, or None when the listing has no
        match address, nothing scores above zero, the top score is below the
        applicable threshold, or the lead over the runner-up is smaller than
        ``PROPERTY_MATCH_MIN_MARGIN``.
    """
    query = listing.get("_listing_match_address")
    if not query:
        return None
    listing_has_numbers = _has_number(query)

    ranked: list[tuple[float, int, dict, str]] = []
    for row in candidates:
        register_key = row.get("_match_register_address")
        epc_key = row.get("_match_epc_address")

        if listing_has_numbers:
            numbers_ok = _numbers_compatible(
                query, register_key
            ) or _numbers_compatible(query, epc_key)
            if not numbers_ok:
                continue

        register_score = _address_score(query, register_key)
        epc_score = _address_score(query, epc_key)
        address_score = max(register_score, epc_score)
        if address_score == 0:
            continue

        # Prefer the current estimate; fall back to the last sale price.
        reference_price = row.get("Estimated current price") or row.get(
            "Last known price"
        )
        total = (
            float(address_score)
            + _enum_bonus(
                listing.get("Property type"),
                row.get("Property type"),
                exact=7.0,
                mismatch=-8.0,
            )
            + _enum_bonus(
                listing.get("Leasehold/Freehold"),
                row.get("Leasehold/Freehold"),
                exact=3.0,
                mismatch=-3.0,
            )
            + _ratio_bonus(
                listing.get("Total floor area (sqm)"),
                row.get("Total floor area (sqm)"),
                pct=0.15,
                cap=8.0,
            )
            + _rooms_bonus(
                listing.get("Number of bedrooms & living rooms"),
                row.get("Number of bedrooms & living rooms"),
            )
            + _ratio_bonus(
                listing.get("Asking price"),
                reference_price,
                pct=0.25,
                cap=3.0,
            )
        )

        if epc_score > register_score:
            matched_field = "Address per EPC"
        else:
            matched_field = "Address per Property Register"
        ranked.append((total, address_score, row, matched_field))

    if not ranked:
        return None

    ranked.sort(key=lambda entry: entry[0], reverse=True)
    best_total, best_address_score, best_row, best_field = ranked[0]
    # With a single candidate the margin is simply its own score.
    if len(ranked) > 1:
        margin = best_total - ranked[1][0]
    else:
        margin = best_total

    if listing_has_numbers:
        min_score = PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
    else:
        min_score = PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    if best_total < min_score or margin < PROPERTY_MATCH_MIN_MARGIN:
        return None

    return {
        "_listing_idx": listing["_listing_idx"],
        "_property_row": best_row["_property_row"],
        "Historical property match score": round(best_total, 1),
        "Historical property address score": best_address_score,
        "Historical property match margin": round(margin, 1),
        "Historical property match field": best_field,
        "Historical property match status": "matched",
    }
def _match_properties(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Match every listing against the property candidates in its postcode.

    Returns one row per matched listing with the match diagnostics; the frame
    has the same schema (and zero rows) when nothing matches. Listings without
    a postcode key are skipped.
    """
    schema = {
        "_listing_idx": pl.UInt32,
        "_property_row": pl.UInt32,
        "Historical property match score": pl.Float32,
        "Historical property address score": pl.Int32,
        "Historical property match margin": pl.Float32,
        "Historical property match field": pl.Utf8,
        "Historical property match status": pl.Utf8,
    }
    if candidates.is_empty():
        return pl.DataFrame(schema=schema)

    by_postcode = _property_candidates_by_postcode(candidates)
    progress = tqdm(
        listings.iter_rows(named=True),
        total=listings.height,
        desc="Matching historical properties",
    )
    found = []
    for row in progress:
        key = row.get("_listing_match_postcode")
        if key:
            hit = _best_property_candidate(row, by_postcode.get(key, []))
            if hit is not None:
                found.append(hit)

    if found:
        return pl.DataFrame(found, schema=schema)
    return pl.DataFrame(schema=schema)
def _prefix_columns(df: pl.DataFrame, columns: list[str], prefix: str) -> pl.DataFrame:
    """Rename those of ``columns`` that exist in ``df`` to ``prefix + name``."""
    mapping = {}
    for name in columns:
        if name in df.columns:
            mapping[name] = f"{prefix}{name}"
    return df.rename(mapping)
def _ensure_prefixed_columns(
    df: pl.DataFrame, columns: list[str], prefix: str
) -> pl.DataFrame:
    """Add an all-null ``prefix + name`` column for each one ``df`` lacks.

    The dtype comes from ``COLUMN_DTYPES`` (Utf8 when the name is not listed).
    ``df`` is returned unchanged when nothing is missing.
    """
    fillers = []
    for name in columns:
        target = f"{prefix}{name}"
        if target in df.columns:
            continue
        dtype = COLUMN_DTYPES.get(name, pl.Utf8)
        fillers.append(pl.lit(None, dtype=dtype).alias(target))
    return df.with_columns(fillers) if fillers else df
def _property_match_frame(
    matches: pl.DataFrame, candidates: pl.DataFrame
) -> pl.DataFrame:
    """Attach the matched property's attributes to each match row.

    The candidate columns are joined on ``_property_row`` and renamed with a
    ``_property_`` prefix. An empty ``matches`` frame is returned as is.
    """
    if matches.is_empty():
        return matches

    carried = [
        name for name in PROPERTY_CANDIDATE_COLUMNS if name in candidates.columns
    ]
    lookup = candidates.select(["_property_row", *carried])
    joined = matches.join(lookup, on="_property_row", how="left")

    to_prefix = [name for name in PROPERTY_CANDIDATE_COLUMNS if name in joined.columns]
    return _prefix_columns(joined, to_prefix, "_property_")
def _canonical_epc_property_type_expr() -> pl.Expr:
    """Map the EPC property type / built form onto the listing type labels.

    For a "House" with a usable built form the built form is taken, otherwise
    the EPC property type; the value is then passed through a fixed
    replacement table. A null EPC property type yields null.
    """
    built_form = pl.col("built_form")
    epc_type = pl.col("epc_property_type")

    unusable_built_form = built_form.is_null() | built_form.is_in(
        ["NO DATA!", "Not Recorded"]
    )
    type_known = epc_type.is_not_null()
    is_house = epc_type == "House"

    relabel = {
        "Flat": "Flats/Maisonettes",
        "Maisonette": "Flats/Maisonettes",
        "End-Terrace": "Terraced",
        "Mid-Terrace": "Terraced",
        "Enclosed End-Terrace": "Terraced",
        "Enclosed Mid-Terrace": "Terraced",
        "Bungalow": "Other",
        "Park home": "Other",
        "House": "Other",
    }
    raw_type = (
        pl.when(type_known & is_house & ~unusable_built_form)
        .then(built_form)
        .when(type_known)
        .then(epc_type)
        .otherwise(None)
    )
    return raw_type.replace(relabel)
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    """Extract the first four-digit run of ``column`` as a UInt16 year.

    The "England and Wales: " and " onwards" fragments are removed before the
    extraction; values without a four-digit run become null.
    """
    band = pl.col(column).cast(pl.Utf8)
    band = band.str.replace("England and Wales: ", "").str.replace(" onwards", "")
    first_year = band.str.extract(r"(\d{4})", 1)
    return first_year.cast(pl.UInt16, strict=False)
def _fractional_year_expr(column: str) -> pl.Expr:
    """Turn a temporal column into ``year + (month - 1) / 12`` as Float32."""
    when = pl.col(column)
    whole_years = when.dt.year().cast(pl.Float32)
    months_elapsed = when.dt.month().cast(pl.Float32) - 1.0
    return whole_years + months_elapsed / 12.0
def _load_epc_candidates(
    epc_path: Path, listing_postcodes: list[str], temp_dir: Path
) -> pl.DataFrame:
    """Load one EPC candidate per address for the listings' postcodes.

    Certificates come from ``_scan_epc_certificates(epc_path, temp_dir)`` and
    are restricted to ``listing_postcodes``. For every (normalised address,
    normalised postcode) pair the certificate with the most recent
    ``inspection_date`` is kept and mapped onto ``EPC_ENRICHMENT_COLUMNS``.
    "Former council house" is "Yes" when any certificate of that pair (not
    only the latest) has a tenure containing "social", otherwise "No".

    Returns:
        A frame with ``_epc_row``, the two match keys,
        ``_epc_canonical_property_type`` and ``EPC_ENRICHMENT_COLUMNS``. Rows
        without a normalised address are dropped.
    """
    # Filter once; both the "latest certificate" and the tenure lookup use it.
    in_scope = (
        _scan_epc_certificates(epc_path, temp_dir)
        .with_columns(
            normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
            normalize_postcode_key(pl.col("epc_postcode")).alias(
                "_epc_match_postcode"
            ),
        )
        .filter(pl.col("_epc_match_postcode").is_in(listing_postcodes))
    )

    def _valid_rating(column: str) -> pl.Expr:
        rating = pl.col(column)
        return pl.when(rating.is_in(EPC_RATING_VALUES)).then(rating).otherwise(None)

    # nulls_last=True: a certificate with no inspection date must never
    # outrank a dated one when picking the "latest" certificate per address.
    # NOTE(review): relies on group_by keeping the sorted row order inside
    # each group — confirm this holds for the streaming engine.
    latest = (
        in_scope.sort("inspection_date", descending=True, nulls_last=True)
        .group_by("_epc_match_address", "_epc_match_postcode")
        .first()
        .with_columns(
            _canonical_epc_property_type_expr().alias("_epc_canonical_property_type"),
            _construction_year_expr().alias("Construction year"),
            _valid_rating("current_energy_rating").alias("Current energy rating"),
            _valid_rating("potential_energy_rating").alias("Potential energy rating"),
            pl.col("total_floor_area").alias("Total floor area (sqm)"),
            pl.col("number_habitable_rooms").alias("Number of bedrooms & living rooms"),
            pl.col("floor_height").alias("Interior height (m)"),
            pl.col("epc_address").alias("Address per EPC"),
        )
        .drop("tenure", strict=False)
    )

    social_tenure = (
        in_scope.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
        .select("_epc_match_address", "_epc_match_postcode")
        .unique()
        .with_columns(pl.lit("Yes").alias("Former council house"))
    )

    return (
        latest.join(
            social_tenure,
            on=["_epc_match_address", "_epc_match_postcode"],
            how="left",
        )
        .with_columns(pl.col("Former council house").fill_null("No"))
        .filter(pl.col("_epc_match_address").is_not_null())
        .with_row_index("_epc_row")
        .select(
            "_epc_row",
            "_epc_match_address",
            "_epc_match_postcode",
            "_epc_canonical_property_type",
            *EPC_ENRICHMENT_COLUMNS,
        )
        .collect(engine="streaming")
    )
def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
    """Group EPC candidate rows (as dicts) by ``_epc_match_postcode``.

    Rows whose postcode key is null or empty are left out.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_epc_match_postcode")
        if not key:
            continue
        if key in grouped:
            grouped[key].append(record)
        else:
            grouped[key] = [record]
    return grouped
def _best_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
    """Pick the EPC certificate that best matches one listing.

    Every candidate is scored on fuzzy address similarity, adjusted by bonuses
    or penalties for property type, floor area and room count. When the
    listing address contains numbers, candidates with incompatible numbers
    are skipped.

    Returns:
        A match record for the top candidate, or None when the listing has no
        match address, nothing scores above zero, the top score is below the
        applicable threshold, or the lead over the runner-up is smaller than
        ``EPC_MATCH_MIN_MARGIN``.
    """
    query = listing.get("_listing_match_address")
    if not query:
        return None
    listing_has_numbers = _has_number(query)

    ranked: list[tuple[float, int, dict]] = []
    for row in candidates:
        epc_key = row.get("_epc_match_address")
        if listing_has_numbers and not _numbers_compatible(query, epc_key):
            continue

        address_score = _address_score(query, epc_key)
        if address_score == 0:
            continue

        total = (
            float(address_score)
            + _enum_bonus(
                listing.get("Property type"),
                row.get("_epc_canonical_property_type"),
                exact=6.0,
                mismatch=-6.0,
            )
            + _ratio_bonus(
                listing.get("Total floor area (sqm)"),
                row.get("Total floor area (sqm)"),
                pct=0.12,
                cap=8.0,
            )
            + _rooms_bonus(
                listing.get("Number of bedrooms & living rooms"),
                row.get("Number of bedrooms & living rooms"),
            )
        )
        ranked.append((total, address_score, row))

    if not ranked:
        return None

    ranked.sort(key=lambda entry: entry[0], reverse=True)
    best_total, best_address_score, best_row = ranked[0]
    # With a single candidate the margin is simply its own score.
    if len(ranked) > 1:
        margin = best_total - ranked[1][0]
    else:
        margin = best_total

    if listing_has_numbers:
        min_score = EPC_MATCH_MIN_SCORE_WITH_NUMBERS
    else:
        min_score = EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
    if best_total < min_score or margin < EPC_MATCH_MIN_MARGIN:
        return None

    return {
        "_listing_idx": listing["_listing_idx"],
        "_epc_row": best_row["_epc_row"],
        "EPC match score": round(best_total, 1),
        "EPC address score": best_address_score,
        "EPC match margin": round(margin, 1),
        "EPC match status": "matched",
    }
def _match_epc(listings: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Match every listing against the EPC candidates in its postcode.

    Returns one row per matched listing with the match diagnostics; the frame
    has the same schema (and zero rows) when nothing matches. Listings without
    a postcode key are skipped.
    """
    schema = {
        "_listing_idx": pl.UInt32,
        "_epc_row": pl.UInt32,
        "EPC match score": pl.Float32,
        "EPC address score": pl.Int32,
        "EPC match margin": pl.Float32,
        "EPC match status": pl.Utf8,
    }
    if candidates.is_empty():
        return pl.DataFrame(schema=schema)

    by_postcode = _epc_candidates_by_postcode(candidates)
    progress = tqdm(
        listings.iter_rows(named=True),
        total=listings.height,
        desc="Matching EPC certificates",
    )
    found = []
    for row in progress:
        key = row.get("_listing_match_postcode")
        if key:
            hit = _best_epc_candidate(row, by_postcode.get(key, []))
            if hit is not None:
                found.append(hit)

    if found:
        return pl.DataFrame(found, schema=schema)
    return pl.DataFrame(schema=schema)
def _epc_match_frame(matches: pl.DataFrame, candidates: pl.DataFrame) -> pl.DataFrame:
    """Attach the matched certificate's attributes to each match row.

    ``EPC_ENRICHMENT_COLUMNS`` are joined on ``_epc_row`` and renamed with an
    ``_epc_`` prefix. An empty ``matches`` frame is returned as is.
    """
    if matches.is_empty():
        return matches

    lookup = candidates.select("_epc_row", *EPC_ENRICHMENT_COLUMNS)
    joined = matches.join(lookup, on="_epc_row", how="left")

    to_prefix = [name for name in EPC_ENRICHMENT_COLUMNS if name in joined.columns]
    return _prefix_columns(joined, to_prefix, "_epc_")
def _join_postcode_features(
    listings: pl.DataFrame, postcode_features_path: Path
) -> pl.DataFrame:
    """Left-join the postcode feature parquet onto listings by ``Postcode``.

    Feature columns whose names clash with listing columns receive a
    ``_postcode`` suffix.
    """
    features = pl.scan_parquet(postcode_features_path).collect(engine="streaming")
    return listings.join(features, on="Postcode", how="left", suffix="_postcode")
def _coalesce_feature_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Merge listing, EPC and property values into the final feature columns.

    Listing values are kept where valid; gaps are filled from the ``_epc_``
    columns first and the ``_property_`` columns second. Price columns are
    derived from the asking price, and the per-sqm price is recomputed from
    the merged floor area.
    """

    def from_epc_then_property(name: str) -> pl.Expr:
        return _coalesce_non_empty(f"_epc_{name}", f"_property_{name}").alias(name)

    def listing_value_if_allowed(name: str, allowed: list[str]) -> pl.Expr:
        own = pl.col(name)
        fallback = pl.col(f"_property_{name}")
        return pl.when(own.is_in(allowed)).then(own).otherwise(fallback).alias(name)

    asking = pl.col("Asking price")
    rooms = pl.col("Number of bedrooms & living rooms")

    version = pl.lit(ENRICHMENT_VERSION, dtype=pl.UInt16).alias(
        "Actual listing enrichment version"
    )
    floor_area = pl.coalesce(
        _valid_number_expr("Total floor area (sqm)"),
        _valid_number_expr("_epc_Total floor area (sqm)"),
        _valid_number_expr("_property_Total floor area (sqm)"),
    ).alias("Total floor area (sqm)")
    # A listing room count is only trusted when it is positive.
    room_count = (
        pl.when(rooms > 0)
        .then(rooms)
        .otherwise(
            pl.coalesce(
                pl.col("_epc_Number of bedrooms & living rooms"),
                pl.col("_property_Number of bedrooms & living rooms"),
            )
        )
        .cast(pl.Int32, strict=False)
        .alias("Number of bedrooms & living rooms")
    )
    interior_height = pl.coalesce(
        _valid_number_expr("_epc_Interior height (m)"),
        _valid_number_expr("_property_Interior height (m)"),
    ).alias("Interior height (m)")
    construction_year = (
        pl.coalesce(
            pl.col("_epc_Construction year"),
            pl.col("_property_Construction year"),
        )
        .cast(pl.UInt16, strict=False)
        .alias("Construction year")
    )
    former_council = (
        _coalesce_non_empty(
            "_epc_Former council house",
            "_property_Former council house",
        )
        .fill_null("No")
        .alias("Former council house")
    )

    # The order below fixes the order of newly appended output columns.
    df = df.with_columns(
        [
            version,
            from_epc_then_property("Address per EPC"),
            listing_value_if_allowed("Property type", PROPERTY_TYPE_VALUES),
            listing_value_if_allowed("Leasehold/Freehold", TENURE_VALUES),
            floor_area,
            room_count,
            asking.alias("Estimated current price"),
            asking.alias("Last known price"),
            from_epc_then_property("Current energy rating"),
            from_epc_then_property("Potential energy rating"),
            interior_height,
            construction_year,
            former_council,
            pl.col("_property_Is construction date approximate").alias(
                "Is construction date approximate"
            ),
            pl.col("_property_Listed building")
            .fill_null("No")
            .alias("Listed building"),
            pl.col("_property_Estimated monthly rent").alias("Estimated monthly rent"),
            pl.col("_property_Street tree density percentile").alias(
                "Street tree density percentile"
            ),
            _fractional_year_expr("_property_Date of last transaction").alias(
                "Date of last transaction"
            ),
        ]
    )

    # Recompute the per-sqm price from the merged floor area.
    area = pl.col("Total floor area (sqm)")
    can_price = asking.is_not_null() & area.is_not_null() & (area > 0)
    per_sqm = (
        pl.when(can_price)
        .then((asking / area).round(0))
        .otherwise(None)
        .cast(pl.Int32, strict=False)
        .alias("Asking price per sqm")
    )
    df = df.with_columns(per_sqm)
    return df.with_columns(
        pl.col("Asking price per sqm").alias("Est. price per sqm"),
        pl.col("Asking price per sqm").alias("Price per sqm"),
    )
def _drop_internal_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Remove helper columns that must not reach the output file.

    Drops every column starting with ``_property_`` or ``_epc_`` plus a fixed
    set of matching keys, row indices and suffixed coordinate columns.
    """
    exact_names = {
        "_listing_idx",
        "_listing_match_address",
        "_listing_match_postcode",
        "_original_postcode",
        "_property_row",
        "_epc_row",
        "lat_postcode",
        "lon_postcode",
    }
    doomed = []
    for name in df.columns:
        if name in exact_names or name.startswith(("_property_", "_epc_")):
            doomed.append(name)
    return df.drop(doomed, strict=False)
def build_enriched_actual_listings(
    listings_path: Path,
    properties_path: Path,
    postcode_features_path: Path,
    arcgis_path: Path,
    output_path: Path,
    *,
    epc_path: Path | None = None,
) -> pl.DataFrame:
    """Enrich scraped listings and write them to ``output_path`` as parquet.

    Listings are matched to historical properties and, when ``epc_path`` is
    given, directly to EPC certificates. Postcode features are joined on, the
    three sources are merged into the final feature columns, and internal
    helper columns are dropped.

    The match diagnostic columns ("Historical property match ..." and
    "EPC match ...") are always present in the output, filled with nulls when
    there is no match or EPC matching is skipped, so the output schema does
    not depend on the data.

    Args:
        listings_path: Scraped listings parquet.
        properties_path: Historical properties parquet.
        postcode_features_path: Postcode feature parquet.
        arcgis_path: Input for the postcode remapping.
        output_path: Destination parquet; parent directories are created.
        epc_path: EPC certificate source, or None to skip direct EPC matching.

    Returns:
        The enriched listings that were written.
    """
    print(f"Loading listings from {listings_path}...")
    listings = _read_listings(listings_path, arcgis_path)
    listing_postcodes = (
        listings.select("_listing_match_postcode")
        .drop_nulls()
        .unique()
        .to_series()
        .to_list()
    )
    print(f"Listings: {listings.height}; unique postcodes: {len(listing_postcodes)}")

    print(f"Loading property candidates from {properties_path}...")
    property_candidates = _load_property_candidates(properties_path, listing_postcodes)
    print(f"Property candidates: {property_candidates.height}")
    property_matches = _match_properties(listings, property_candidates)
    print(f"Historical property matches: {property_matches.height}")
    property_match_frame = _property_match_frame(property_matches, property_candidates)

    enriched = _join_postcode_features(listings, postcode_features_path)
    # Join even when there are no matches: the zero-row frame still carries
    # the typed diagnostic columns, which then come through as nulls.
    enriched = enriched.join(property_match_frame, on="_listing_idx", how="left")

    if epc_path is not None:
        with tempfile.TemporaryDirectory(
            prefix="actual_listing_epc_", dir=local_tmp_dir()
        ) as tmpdir:
            print(f"Loading EPC candidates from {epc_path}...")
            epc_candidates = _load_epc_candidates(
                epc_path, listing_postcodes, Path(tmpdir)
            )
            print(f"EPC candidates: {epc_candidates.height}")
            epc_matches = _match_epc(listings, epc_candidates)
            print(f"EPC matches: {epc_matches.height}")
            epc_match_frame = _epc_match_frame(epc_matches, epc_candidates)
    else:
        # With no candidates _match_epc returns its typed, zero-row result.
        epc_match_frame = _match_epc(listings, pl.DataFrame())
    enriched = enriched.join(epc_match_frame, on="_listing_idx", how="left")

    # Guarantee every prefixed source column exists before merging.
    enriched = _ensure_prefixed_columns(
        enriched, PROPERTY_CANDIDATE_COLUMNS, "_property_"
    )
    enriched = _ensure_prefixed_columns(enriched, EPC_ENRICHMENT_COLUMNS, "_epc_")
    enriched = _coalesce_feature_columns(enriched)
    enriched = _drop_internal_columns(enriched)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    enriched.write_parquet(output_path)
    size_mb = output_path.stat().st_size / (1024 * 1024)
    print(
        f"Wrote {enriched.height} enriched listings to {output_path} ({size_mb:.1f} MB)"
    )
    return enriched
def main() -> None:
    """Parse the command line and build the enriched listings parquet.

    Direct EPC matching is skipped when ``--no-epc`` is given or when the EPC
    source does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Build a pre-enriched actual-listings parquet for the server"
    )

    def add_path_option(flag: str, default: str, help_text: str) -> None:
        parser.add_argument(flag, type=Path, default=Path(default), help=help_text)

    add_path_option(
        "--listings",
        "finder/data/online_listings_buy.parquet",
        "Input scraped listings parquet",
    )
    add_path_option(
        "--properties",
        "property-data/properties.parquet",
        "Historical properties parquet",
    )
    add_path_option(
        "--postcode-features",
        "property-data/postcode.parquet",
        "Postcode feature parquet",
    )
    add_path_option(
        "--arcgis",
        "property-data/arcgis_data.parquet",
        "ArcGIS/NSPL postcode parquet used for terminated-postcode remapping",
    )
    add_path_option(
        "--epc",
        "manual-data/domestic-csv.zip",
        "Optional EPC certificates CSV/zip for direct listing-to-EPC fuzzy matching",
    )
    parser.add_argument(
        "--no-epc",
        action="store_true",
        help="Skip direct EPC matching even when --epc exists",
    )
    add_path_option(
        "--output",
        "finder/data/online_listings_buy_enriched.parquet",
        "Output enriched listings parquet",
    )
    args = parser.parse_args()

    epc_path = args.epc
    if args.no_epc:
        epc_path = None
    elif not epc_path.exists():
        print(
            f"EPC source not found at {epc_path}; continuing without direct EPC matching"
        )
        epc_path = None

    build_enriched_actual_listings(
        listings_path=args.listings,
        properties_path=args.properties,
        postcode_features_path=args.postcode_features,
        arcgis_path=args.arcgis,
        output_path=args.output,
        epc_path=epc_path,
    )


if __name__ == "__main__":
    main()