# Python source file (2323 lines, 81 KiB).
import argparse
|
|
import re
|
|
import tempfile
|
|
from dataclasses import dataclass
|
|
from typing import Literal
|
|
|
|
import numpy as np
|
|
import polars as pl
|
|
from pathlib import Path
|
|
|
|
import pyogrio
|
|
from pyproj import Transformer
|
|
from scipy.spatial import cKDTree
|
|
from shapely import from_wkb, points
|
|
from shapely.geometry.base import BaseGeometry
|
|
from shapely.strtree import STRtree
|
|
from thefuzz import fuzz
|
|
|
|
from pipeline.local_temp import local_tmp_dir
|
|
from pipeline.transform.join_epc_pp import _scan_epc_certificates
|
|
from pipeline.utils.fuzzy_join import (
|
|
normalize_address_key,
|
|
normalize_postcode_key,
|
|
)
|
|
from pipeline.utils.postcode_mapping import build_postcode_mapping
|
|
|
|
# Thresholds and feature names shared by the listed-building and
# conservation-area enrichment helpers in this module.

# presumably a lower bound on floor area in square metres; its consumer is
# not visible in this part of the file.
MIN_FLOOR_AREA_M2 = 10
# Name of the "Yes"/"No" column produced by _postcode_conservation_area_flags.
CONSERVATION_AREA_FEATURE = "Within conservation area"
# Name of the flag column produced by _matched_listed_building_flags.
LISTED_BUILDING_FEATURE = "Listed building"
# Default KD-tree search radius used when pairing listed points with postcodes.
LISTED_BUILDING_MATCH_RADIUS_M = 250.0
# Default number of nearest postcodes considered for each listed point.
LISTED_BUILDING_NEAREST_POSTCODES = 3
# Default minimum fuzzy token-set score for a name/address match.
LISTED_BUILDING_MIN_MATCH_SCORE = 95
# Value of the `dataset` field that marks a conservation-area record.
PLANNING_DATA_CONSERVATION_AREA_DATASET = "conservation-area"
|
|
|
|
# IoD score columns; the name suggests these are the ones converted to
# percentiles (see _less_deprived_percentile_expr), but the consumer is not
# visible in this part of the file.
_IOD_PERCENTILE_COLUMNS = [
    "Education, Skills and Training Score",
    "Income Score (rate)",
    "Employment Score (rate)",
    "Health Deprivation and Disability Score",
    "Indoors Sub-domain Score",
    "Outdoors Sub-domain Score",
]
|
|
|
|
|
|
# Area-level column names grouped by theme. The consumer of this list is not
# visible in this part of the file.
_AREA_COLUMNS = [
    "Postcode",
    "lat",
    "lon",
    # Runtime provenance for deciding whether missing coordinates are skippable.
    "ctry25cd",
    # Keyed lookup for postcode-level side tables (e.g. crime time series).
    "lsoa21",
    # Deprivation
    "Income Score",
    "Employment Score",
    "Education, Skills and Training Score",
    "Health Deprivation and Disability Score",
    "Housing Conditions Score",
    "Air Quality and Road Safety Score",
    # Ethnicity
    "% South Asian",
    "% East Asian",
    "% Black",
    "% Mixed",
    "% White",
    "% Other",
    # Crime
    "Anti-social behaviour (avg/yr)",
    "Violence and sexual offences (avg/yr)",
    "Criminal damage and arson (avg/yr)",
    "Burglary (avg/yr)",
    "Vehicle crime (avg/yr)",
    "Robbery (avg/yr)",
    "Other theft (avg/yr)",
    "Shoplifting (avg/yr)",
    "Drugs (avg/yr)",
    "Possession of weapons (avg/yr)",
    "Public order (avg/yr)",
    "Bicycle theft (avg/yr)",
    "Theft from the person (avg/yr)",
    "Other crime (avg/yr)",
    "Serious crime (avg/yr)",
    "Minor crime (avg/yr)",
    "Serious crime per 1k residents (avg/yr)",
    "Minor crime per 1k residents (avg/yr)",
    # Amenities
    "Number of restaurants within 2km",
    "Number of grocery shops and supermarkets within 2km",
    # Environment
    "Noise (dB)",
    "Max available download speed (Mbps)",
    CONSERVATION_AREA_FEATURE,
    # Schools
    "Good+ primary schools within 5km",
    "Good+ secondary schools within 5km",
    "Good+ primary schools within 2km",
    "Good+ secondary schools within 2km",
    "Outstanding primary schools within 5km",
    "Outstanding secondary schools within 5km",
    "Outstanding primary schools within 2km",
    "Outstanding secondary schools within 2km",
    # Demographics
    "Median age",
    # Politics
    "Voter turnout (%)",
    "% Labour",
    "% Conservative",
    "% Liberal Democrat",
    "% Reform UK",
    "% Green",
    "% Other parties",
]
|
|
|
|
|
|
# Patterns recognising dynamically named POI metric columns; both are used by
# _is_dynamic_poi_metric_column.
_DYNAMIC_POI_DISTANCE_RE = re.compile(r"^Distance to nearest amenity \(.+\) \(km\)$")
_DYNAMIC_POI_COUNT_RE = re.compile(r"^Number of amenities \(.+\) within (2|5)km$")
# Canonical output column name used by _tree_density_by_postcode.
TREE_DENSITY_FEATURE = "Street tree density percentile"
# Fallback source-column pattern accepted by _tree_density_by_postcode when
# the canonical column is absent.
_POSTCODE_TREE_DENSITY_PERCENTILE_RE = re.compile(
    r"^Tree canopy density percentile within \d+m$"
)
_RENT_SOURCE_UNAVAILABLE_LADS = {
    # ONS PIPR does not publish LAD-level private-rent estimates for these
    # small authorities. Keep rent null there, but fail on any other LAD miss.
    "E06000053": "Isles of Scilly",
    "E09000001": "City of London",
}
# Matches any run of digits; used to compare numbers embedded in addresses.
_NUMBER_RE = re.compile(r"\d+")
# Upper-case filler words ignored by _is_matchable_listed_name when counting
# substantive name tokens.
_LISTED_NAME_STOP_WORDS = {
    "A",
    "AN",
    "AND",
    "AT",
    "BY",
    "IN",
    "OF",
    "ON",
    "THE",
    "TO",
    "WITH",
}
|
|
|
|
|
|
def _is_dynamic_poi_metric_column(column: str) -> bool:
    """Return True when ``column`` is a dynamic POI distance or count metric name."""
    if _DYNAMIC_POI_DISTANCE_RE.match(column):
        return True
    return _DYNAMIC_POI_COUNT_RE.match(column) is not None
|
|
|
|
|
|
def _numbers_compatible(left: str, right: str) -> bool:
    """Check that the digit runs in two address-like strings agree.

    Returns False when exactly one side contains numbers. Otherwise the side
    with fewer distinct numbers must be a subset of the other; two strings
    without any numbers are compatible.
    """
    nums_left = set(_NUMBER_RE.findall(left))
    nums_right = set(_NUMBER_RE.findall(right))
    # One side numbered, the other not: never compatible.
    if bool(nums_left) != bool(nums_right):
        return False
    # Equivalent to "smaller set is a subset of the larger set".
    return nums_left <= nums_right or nums_right <= nums_left
|
|
|
|
|
|
def _listed_candidate_schema() -> dict[str, pl.DataType]:
    """Return the column-name to dtype mapping of listed-building candidate frames."""
    text_columns = ("postcode", "_listed_match_name", "_listed_grade")
    schema: dict[str, pl.DataType] = {name: pl.Utf8 for name in text_columns}
    schema["_listed_entry"] = pl.Int64
    return schema
|
|
|
|
|
|
def _empty_listed_candidates() -> pl.DataFrame:
    """Build a zero-row frame that carries the listed-candidate schema."""
    candidate_schema = _listed_candidate_schema()
    return pl.DataFrame(schema=candidate_schema)
|
|
|
|
|
|
def _empty_listed_property_flags() -> pl.DataFrame:
    """Build a zero-row frame with the string columns of a listed-property flag table."""
    flag_columns = ("postcode", "pp_address", LISTED_BUILDING_FEATURE)
    return pl.DataFrame(schema=dict.fromkeys(flag_columns, pl.Utf8))
|
|
|
|
|
|
def _is_matchable_listed_name(name_key: str | None) -> bool:
    """Decide whether a normalised listed-building name is specific enough to match.

    A name qualifies when it contains a number, or when it has at least two
    tokens that are three or more characters long and are not stop words.
    """
    if not name_key:
        return False
    if _NUMBER_RE.search(name_key) is not None:
        return True
    substantive = sum(
        1
        for word in name_key.split()
        if len(word) >= 3 and word not in _LISTED_NAME_STOP_WORDS
    )
    return substantive >= 2
|
|
|
|
|
|
def _load_listed_building_points(listed_buildings_path: Path) -> pl.DataFrame:
    """Read listed-building point attributes and attach a normalised match name.

    Raises:
        ValueError: if the source geometry type is not a point type, or if
            any of the expected attribute columns is absent after reading.
    """
    geometry_type = str(
        pyogrio.read_info(listed_buildings_path).get("geometry_type") or ""
    )
    if "Point" not in geometry_type:
        raise ValueError(
            f"Expected listed-building point data, got geometry {geometry_type!r}"
        )

    wanted = ["ListEntry", "Name", "Grade", "Easting", "Northing"]
    # Only attributes are needed; coordinates come from the Easting/Northing columns.
    _meta, arrow_table = pyogrio.read_arrow(
        listed_buildings_path,
        columns=wanted,
        read_geometry=False,
    )
    frame = pl.from_arrow(arrow_table)
    absent = sorted(set(wanted).difference(frame.columns))
    if absent:
        raise ValueError(
            f"{listed_buildings_path} is missing listed-building columns: {absent}"
        )

    target_dtypes = {
        "ListEntry": pl.Int64,
        "Name": pl.Utf8,
        "Grade": pl.Utf8,
        "Easting": pl.Float64,
        "Northing": pl.Float64,
    }
    typed = frame.select(
        [pl.col(name).cast(dtype) for name, dtype in target_dtypes.items()]
    ).drop_nulls(["Name", "Easting", "Northing"])
    with_key = typed.with_columns(
        normalize_address_key(pl.col("Name")).alias("_listed_match_name")
    )
    return with_key.filter(pl.col("_listed_match_name").is_not_null())
|
|
|
|
|
|
def _postcode_listed_building_candidates(
    listed_points: pl.DataFrame,
    active_postcodes: pl.DataFrame,
    *,
    nearest_postcodes: int = LISTED_BUILDING_NEAREST_POSTCODES,
    max_distance_m: float = LISTED_BUILDING_MATCH_RADIUS_M,
) -> pl.DataFrame:
    """Pair every matchable listed-building point with its nearby postcodes.

    For each listed point, up to ``nearest_postcodes`` postcodes within
    ``max_distance_m`` (KD-tree distance over easting/northing) are emitted as
    candidate rows. Names rejected by ``_is_matchable_listed_name`` are skipped.

    Returns:
        A frame with the ``_listed_candidate_schema`` columns, de-duplicated
        on postcode, match name and list entry; empty when nothing qualifies.

    Raises:
        ValueError: if either input lacks a required column.
    """
    if listed_points.is_empty() or active_postcodes.is_empty():
        return _empty_listed_candidates()

    postcode_required = ("postcode", "east1m", "north1m")
    absent = sorted(set(postcode_required).difference(active_postcodes.columns))
    if absent:
        raise ValueError(f"Active postcode data missing required columns: {absent}")

    listed_required = (
        "_listed_match_name",
        "Grade",
        "ListEntry",
        "Easting",
        "Northing",
    )
    absent = sorted(set(listed_required).difference(listed_points.columns))
    if absent:
        raise ValueError(f"Listed-building data missing required columns: {absent}")

    # Keep only rows with usable, finite coordinates on both sides.
    usable_postcodes = active_postcodes.drop_nulls(
        ["postcode", "east1m", "north1m"]
    ).filter(pl.col("east1m").is_finite() & pl.col("north1m").is_finite())
    usable_listed = listed_points.drop_nulls(
        ["_listed_match_name", "Easting", "Northing"]
    ).filter(pl.col("Easting").is_finite() & pl.col("Northing").is_finite())
    if usable_postcodes.is_empty() or usable_listed.is_empty():
        return _empty_listed_candidates()

    postcode_xy = np.column_stack(
        [usable_postcodes["east1m"].to_numpy(), usable_postcodes["north1m"].to_numpy()]
    )
    listed_xy = np.column_stack(
        [usable_listed["Easting"].to_numpy(), usable_listed["Northing"].to_numpy()]
    )
    postcode_count = usable_postcodes.height
    neighbours = max(1, min(nearest_postcodes, postcode_count))
    dist, idx = cKDTree(postcode_xy).query(
        listed_xy,
        k=neighbours,
        distance_upper_bound=max_distance_m,
    )
    if neighbours == 1:
        # A k=1 query yields 1-D arrays; give them a neighbour axis so the
        # per-point iteration below is uniform.
        dist = dist[:, np.newaxis]
        idx = idx[:, np.newaxis]

    postcode_lookup = usable_postcodes["postcode"].to_list()
    per_point = zip(
        usable_listed["_listed_match_name"].to_list(),
        usable_listed["Grade"].to_list(),
        usable_listed["ListEntry"].to_list(),
        dist,
        idx,
    )

    rows: list[tuple[str, str, str | None, int | None]] = []
    for name_key, grade, entry, point_dist, point_idx in per_point:
        if not _is_matchable_listed_name(name_key):
            continue
        emitted: set[str] = set()
        for neighbour_dist, neighbour_idx in zip(point_dist, point_idx):
            # Skip slots with a non-finite distance or an out-of-range index,
            # i.e. neighbours the radius-bounded query did not fill.
            if neighbour_idx >= postcode_count or not np.isfinite(neighbour_dist):
                continue
            postcode = postcode_lookup[int(neighbour_idx)]
            if postcode in emitted:
                continue
            emitted.add(postcode)
            rows.append((postcode, name_key, grade, entry))

    if not rows:
        return _empty_listed_candidates()

    candidates = pl.DataFrame(
        rows,
        schema=[
            "postcode",
            "_listed_match_name",
            "_listed_grade",
            "_listed_entry",
        ],
        orient="row",
    )
    return candidates.cast(_listed_candidate_schema()).unique(
        ["postcode", "_listed_match_name", "_listed_entry"]
    )
|
|
|
|
|
|
def _matched_listed_building_flags(
    properties: pl.LazyFrame,
    listed_candidates: pl.DataFrame,
    *,
    min_score: int = LISTED_BUILDING_MIN_MATCH_SCORE,
) -> pl.DataFrame:
    """Flag properties whose address fuzzily matches a listed name in the same postcode.

    A property matches when either its normalised ``pp_address`` or
    ``epc_address`` is number-compatible with a candidate name and reaches
    ``min_score`` on ``fuzz.token_set_ratio``.

    Returns:
        A frame of ``postcode``, ``pp_address`` and the listed-building flag
        (always ``"Yes"``), unique per postcode/address; empty when no match.
    """
    if listed_candidates.is_empty():
        return _empty_listed_property_flags()

    postcodes_with_candidates = listed_candidates.select("postcode").unique()
    shortlisted = (
        properties.select("postcode", "pp_address", "epc_address")
        .join(postcodes_with_candidates.lazy(), on="postcode", how="semi")
        .with_columns(
            normalize_address_key(pl.col("pp_address")).alias("_pp_match_address"),
            normalize_address_key(pl.col("epc_address")).alias("_epc_match_address"),
        )
        .filter(
            pl.col("pp_address").is_not_null()
            & (
                pl.col("_pp_match_address").is_not_null()
                | pl.col("_epc_match_address").is_not_null()
            )
        )
        .collect(engine="streaming")
    )
    if shortlisted.is_empty():
        return _empty_listed_property_flags()

    names_by_postcode: dict[str, list[str]] = {}
    candidate_pairs = listed_candidates.select("postcode", "_listed_match_name")
    for postcode, name in candidate_pairs.iter_rows():
        if not (postcode and name):
            continue
        names_by_postcode.setdefault(postcode, []).append(name)

    def _hits_any(address_key: str, names: list[str]) -> bool:
        # The number check runs first so the fuzzy scorer is only called on
        # pairs that pass it.
        return any(
            _numbers_compatible(address_key, name)
            and fuzz.token_set_ratio(address_key, name) >= min_score
            for name in names
        )

    matches: list[tuple[str, str, str]] = []
    for row in shortlisted.iter_rows(named=True):
        postcode = row["postcode"]
        names = names_by_postcode.get(postcode)
        if not names:
            continue

        # Register address first, then EPC address; drop empties and duplicates.
        address_keys = list(
            dict.fromkeys(
                value
                for value in (
                    row.get("_pp_match_address"),
                    row.get("_epc_match_address"),
                )
                if value
            )
        )
        if any(_hits_any(address_key, names) for address_key in address_keys):
            matches.append((postcode, row["pp_address"], "Yes"))

    if not matches:
        return _empty_listed_property_flags()

    flag_columns = ["postcode", "pp_address", LISTED_BUILDING_FEATURE]
    flagged = pl.DataFrame(matches, schema=flag_columns, orient="row")
    return flagged.cast(dict.fromkeys(flag_columns, pl.Utf8)).unique(
        ["postcode", "pp_address"]
    )
|
|
|
|
|
|
def _listed_building_flags(
    properties: pl.LazyFrame,
    active_postcodes: pl.DataFrame,
    listed_buildings_path: Path,
) -> pl.DataFrame:
    """Load listed-building points and return the matched property flags.

    Chains loading, postcode candidate generation and address matching,
    printing a progress line before and after each stage.
    """
    print(f"Loading listed-building points from {listed_buildings_path}...")
    points_frame = _load_listed_building_points(listed_buildings_path)
    print(f"Loaded {points_frame.height} listed-building point records")

    candidates = _postcode_listed_building_candidates(points_frame, active_postcodes)
    nearby_postcodes = candidates["postcode"].n_unique()
    print(
        "Matching listed-building names to property addresses across "
        f"{nearby_postcodes} nearby postcodes..."
    )

    matched_flags = _matched_listed_building_flags(properties, candidates)
    print(f"Matched {matched_flags.height} property addresses to listed-building entries")
    return matched_flags
|
|
|
|
|
|
def _normalise_crs(crs: object | None) -> str:
|
|
return str(crs) if crs else "EPSG:4326"
|
|
|
|
|
|
def _geometry_column(metadata: dict, column_names: list[str]) -> str:
|
|
geometry_name = metadata.get("geometry_name")
|
|
if geometry_name:
|
|
return str(geometry_name)
|
|
for name in ("wkb_geometry", "geometry", "geom"):
|
|
if name in column_names:
|
|
return name
|
|
return column_names[-1]
|
|
|
|
|
|
def _column_values(table, column: str, default: object = None) -> list[object]:
|
|
if column not in table.column_names:
|
|
return [default] * table.num_rows
|
|
return table[column].combine_chunks().to_pylist()
|
|
|
|
|
|
def _is_planning_conservation_area_record(dataset: object) -> bool:
    """Accept records with no dataset tag or with the conservation-area tag.

    The comparison ignores surrounding whitespace and letter case.
    """
    if dataset is None:
        return True
    tag = str(dataset).strip().casefold()
    return tag == PLANNING_DATA_CONSERVATION_AREA_DATASET
|
|
|
|
|
|
def _is_current_planning_record(end_date: object) -> bool:
|
|
if end_date is None:
|
|
return True
|
|
if isinstance(end_date, str):
|
|
return end_date.strip() == ""
|
|
return False
|
|
|
|
|
|
def _load_conservation_area_geometries(
    conservation_areas_path: Path,
) -> tuple[list[BaseGeometry], str]:
    """Read conservation-area polygons and the CRS string of their source.

    Records are dropped, in this order of precedence, when they belong to
    another dataset, have an end date, have a missing/empty geometry, or are
    not (multi)polygons. A summary of dropped records is printed when any exist.

    Raises:
        ValueError: if no usable polygon geometry remains.
    """
    metadata, table = pyogrio.read_arrow(conservation_areas_path)
    geometry_name = _geometry_column(metadata, table.column_names)
    dataset_tags = _column_values(table, "dataset")
    end_dates = _column_values(table, "end-date")
    decoded = from_wkb(table[geometry_name].combine_chunks().to_pylist())

    kept = []
    n_other_dataset = n_ended = n_empty = n_non_polygon = 0
    for dataset, end_date, geom in zip(dataset_tags, end_dates, decoded, strict=True):
        if not _is_planning_conservation_area_record(dataset):
            n_other_dataset += 1
        elif not _is_current_planning_record(end_date):
            n_ended += 1
        elif geom is None or geom.is_empty:
            n_empty += 1
        elif geom.geom_type not in {"Polygon", "MultiPolygon"}:
            n_non_polygon += 1
        else:
            kept.append(geom)

    if not kept:
        raise ValueError(
            f"{conservation_areas_path} does not contain any usable polygon geometries"
        )
    if any((n_other_dataset, n_ended, n_empty, n_non_polygon)):
        print(
            "Skipped conservation-area records during load: "
            f"other_dataset={n_other_dataset}, "
            f"ended={n_ended}, "
            f"empty_geometry={n_empty}, "
            f"non_polygon={n_non_polygon}"
        )
    return kept, _normalise_crs(metadata.get("crs"))
|
|
|
|
|
|
def _postcode_conservation_area_flags(
    postcodes: pl.DataFrame,
    conservation_geometries: list[BaseGeometry],
    conservation_crs: object | None,
    batch_size: int = 100_000,
) -> pl.DataFrame:
    """Flag each postcode "Yes"/"No" for intersecting a conservation-area geometry.

    Postcode lat/lon are transformed from EPSG:4326 into the CRS of the
    geometries and tested against an STRtree in batches of ``batch_size``.
    Postcodes without usable coordinates are reported as "No".

    Raises:
        ValueError: if ``postcodes`` lacks ``postcode``, ``lat`` or ``lon``.
    """
    absent = sorted({"postcode", "lat", "lon"}.difference(postcodes.columns))
    if absent:
        raise ValueError(f"Postcode data missing required columns: {absent}")

    all_postcodes = postcodes.select("postcode").drop_nulls().unique()

    def _all_outside() -> pl.DataFrame:
        # Result used when no postcode has coordinates that can be tested.
        return all_postcodes.with_columns(pl.lit("No").alias(CONSERVATION_AREA_FEATURE))

    located = postcodes.select("postcode", "lat", "lon").drop_nulls()
    if located.is_empty():
        return _all_outside()

    finite_mask = np.isfinite(located["lat"].to_numpy()) & np.isfinite(
        located["lon"].to_numpy()
    )
    located = located.filter(pl.Series(finite_mask))
    if located.is_empty():
        return _all_outside()

    to_target = Transformer.from_crs(
        "EPSG:4326", _normalise_crs(conservation_crs), always_xy=True
    )
    # always_xy=True: the transformer takes (lon, lat) order.
    xs, ys = to_target.transform(
        located["lon"].to_numpy(), located["lat"].to_numpy()
    )

    index = STRtree(conservation_geometries)
    total = located.height
    within = np.zeros(total, dtype=bool)
    for lo in range(0, total, batch_size):
        hi = min(lo + batch_size, total)
        hits = index.query(points(xs[lo:hi], ys[lo:hi]), predicate="intersects")
        if hits.size > 0:
            # Row 0 of the result holds indices into the batch of input points.
            within[lo + hits[0]] = True

    per_postcode = (
        located.select("postcode")
        .with_columns(pl.Series("_within_conservation_area", within))
        .group_by("postcode")
        .agg(pl.col("_within_conservation_area").max())
    )
    yes_no = (
        pl.when(pl.col("_within_conservation_area"))
        .then(pl.lit("Yes"))
        .otherwise(pl.lit("No"))
        .alias(CONSERVATION_AREA_FEATURE)
    )
    matched = per_postcode.with_columns(yes_no).select(
        "postcode", CONSERVATION_AREA_FEATURE
    )

    joined = all_postcodes.join(matched, on="postcode", how="left")
    return joined.with_columns(
        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
    ).select("postcode", CONSERVATION_AREA_FEATURE)
|
|
|
|
|
|
def _conservation_area_by_postcode(
    postcodes: pl.LazyFrame,
    conservation_areas_path: Path,
) -> pl.LazyFrame:
    """Load conservation-area polygons and return lazy per-postcode membership flags."""
    print(f"Loading conservation area polygons from {conservation_areas_path}...")
    geometries, crs = _load_conservation_area_geometries(conservation_areas_path)

    coordinates = postcodes.select("postcode", "lat", "lon")
    postcode_points = coordinates.collect(engine="streaming")
    print(
        "Computing conservation area membership for "
        f"{postcode_points.height} active English postcodes..."
    )

    flags = _postcode_conservation_area_flags(postcode_points, geometries, crs)
    return flags.lazy()
|
|
|
|
|
|
def _less_deprived_percentile_expr(column: str) -> pl.Expr:
    """Build an expression mapping a deprivation score to a 0-100 less-deprived percentile.

    Nulls stay null, the minimum score maps to 100.0 and the maximum to 0.0.
    Other values are scaled from their descending average rank and rounded to
    one decimal; 100.0 is the fallback when there is at most one non-null value.
    The result keeps the name ``column``.
    """
    score = pl.col(column)
    n_scores = score.count()
    rank_from_highest = score.rank("average", descending=True)
    scaled = ((rank_from_highest - 1) / (n_scores - 1) * 100).round(1)
    percentile = (
        pl.when(score.is_null())
        .then(None)
        .when(score == score.min())
        .then(100.0)
        .when(score == score.max())
        .then(0.0)
        .when(n_scores > 1)
        .then(scaled)
        .otherwise(100.0)
    )
    return percentile.alias(column)
|
|
|
|
|
|
def _tree_density_by_postcode(tree_density_postcodes_path: Path) -> pl.LazyFrame:
    """Scan the tree-density parquet and expose one Float32 density column per postcode.

    The canonical ``TREE_DENSITY_FEATURE`` column is used when present;
    otherwise exactly one column matching the "within <N>m" percentile pattern
    must exist and is renamed to it.

    Raises:
        ValueError: if ``postcode`` is missing, or if no unambiguous density
            column can be found.
    """
    lazy_density = pl.scan_parquet(tree_density_postcodes_path)
    available = set(lazy_density.collect_schema().names())
    if "postcode" not in available:
        raise ValueError(
            f"{tree_density_postcodes_path} is missing required column: postcode"
        )

    density_column = TREE_DENSITY_FEATURE
    if density_column not in available:
        candidates = sorted(
            name
            for name in available
            if _POSTCODE_TREE_DENSITY_PERCENTILE_RE.match(name)
        )
        if len(candidates) != 1:
            raise ValueError(
                f'{tree_density_postcodes_path} must contain column "{TREE_DENSITY_FEATURE}" '
                'or exactly one "Tree canopy density percentile within {radius}m" column; '
                f"found {len(candidates)} postcode percentile columns"
            )
        density_column = candidates[0]

    selected = lazy_density.select(
        pl.col("postcode"),
        pl.col(density_column).cast(pl.Float32).alias(TREE_DENSITY_FEATURE),
    )
    return selected.drop_nulls(["postcode"]).unique(["postcode"])
|
|
|
|
|
|
def _validate_lad_source_coverage(
    iod_path: Path, ethnicity_path: Path, rental_prices_path: Path
) -> None:
    """Check that ethnicity and rental sources cover every LAD present in the IoD file.

    Raises:
        ValueError: if any IoD LAD is missing from the ethnicity data, or is
            missing from the rental data without being listed in
            ``_RENT_SOURCE_UNAVAILABLE_LADS``.

    LADs that are missing from the rental data but listed as
    source-unavailable are only reported with a printed notice.
    """
    code_column = "Local Authority District code (2024)"
    name_column = "Local Authority District name (2024)"
    iod_lads = (
        pl.read_parquet(iod_path, columns=[code_column, name_column])
        .rename({code_column: "lad", name_column: "lad_name"})
        .unique(["lad"])
    )

    def _lads_absent_from(path: Path, lad_column: str) -> pl.DataFrame:
        # IoD LADs with no row in the given source, ordered by LAD code.
        covered = pl.read_parquet(path, columns=[lad_column]).rename(
            {lad_column: "lad"}
        )
        return iod_lads.join(covered, on="lad", how="anti").sort("lad")

    missing_ethnicity = _lads_absent_from(ethnicity_path, "Geography_code")
    if missing_ethnicity.height > 0:
        raise ValueError(
            "Ethnicity data is missing 2024 LAD coverage: "
            f"{missing_ethnicity.to_dicts()}"
        )

    missing_rent = _lads_absent_from(rental_prices_path, "area_code")
    tolerated = list(_RENT_SOURCE_UNAVAILABLE_LADS)
    unexpected_missing_rent = missing_rent.filter(~pl.col("lad").is_in(tolerated))
    if unexpected_missing_rent.height > 0:
        raise ValueError(
            "Rental data is missing 2024 LAD coverage: "
            f"{unexpected_missing_rent.to_dicts()}"
        )
    if missing_rent.height > 0:
        print(
            "PIPR has no LAD-level rent estimates for source-unavailable LADs; "
            f"rent will remain null there: {missing_rent.to_dicts()}"
        )
|
|
|
|
|
|
def _validate_property_postcodes(df: pl.DataFrame) -> None:
    """Fail when any row has a null or blank ``Postcode``.

    Raises:
        ValueError: carrying the offending row count and up to ten sample
            rows restricted to the identifying columns that exist in ``df``.
    """
    postcode = pl.col("Postcode")
    is_blank = postcode.is_null() | (postcode.cast(pl.Utf8).str.strip_chars() == "")
    offenders = df.filter(is_blank)
    if offenders.is_empty():
        return

    preferred = ("Postcode", "Address per Property Register", "Last known price")
    sample_cols = [name for name in preferred if name in offenders.columns]
    sample = offenders.select(sample_cols).head(10).to_dicts()
    raise ValueError(
        "Property rows missing a postcode after merge: "
        f"{offenders.height} rows. Sample: {sample}"
    )
|
|
|
|
|
|
# Map listings-parquet source columns to the `_actual_*` overlay columns
# carried alongside the wide frame through the postcode-keyed joins. After the
# rest of the pipeline finalises, listing rows pick their canonical dashboard
# values from these overlays in `_finalize_listings`.
# Each entry is (source column, overlay column, target dtype).
_LISTING_OVERLAY_SOURCES: tuple[tuple[str, str, pl.DataType], ...] = (
    ("Listing URL", "_actual_listing_url", pl.Utf8),
    ("Asking price", "_actual_asking_price", pl.Int64),
    ("Asking price per sqm", "_actual_asking_price_per_sqm", pl.Int32),
    ("Listing date", "_actual_listing_date", pl.Datetime("us")),
    ("Listing status", "_actual_listing_status", pl.Utf8),
    ("Listing features", "_actual_listing_features", pl.List(pl.Utf8)),
    ("Bedrooms", "_actual_bedrooms", pl.Int32),
    ("Bathrooms", "_actual_bathrooms", pl.Int32),
    ("Price qualifier", "_actual_price_qualifier", pl.Utf8),
    ("Property sub-type", "_actual_property_sub_type", pl.Utf8),
    ("lat", "_actual_lat", pl.Float64),
    ("lon", "_actual_lon", pl.Float64),
    # Seeds for the wide row that an unmatched listing produces.
    ("Total floor area (sqm)", "_actual_total_floor_area", pl.Float64),
    ("Number of bedrooms & living rooms", "_actual_number_habitable_rooms", pl.Int16),
    ("Property type", "_actual_property_type", pl.Utf8),
    ("Leasehold/Freehold", "_actual_leasehold_freehold", pl.Utf8),
)
# presumably a non-null value in this overlay column marks a row as a
# listing; the consumer is not visible in this part of the file.
_LISTING_FLAG_COLUMN = "_actual_listing_url"
_TENURE_VALUES = ["Freehold", "Leasehold"]
_PROPERTY_TYPE_VALUES = [
    "Detached",
    "Semi-Detached",
    "Terraced",
    "Flats/Maisonettes",
    "Other",
]
_EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
# Fuzzy-match thresholds. The names distinguish addresses with and without
# numbers; the code applying them is not visible in this part of the file.
_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
_PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS = 82
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS = 96
_PROPERTY_MATCH_MIN_MARGIN = 4.0
_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
_DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
_DIRECT_EPC_MATCH_MIN_MARGIN = 4.0
_DIRECT_EPC_NEARBY_RADIUS_M = 500.0
_DIRECT_EPC_NEAREST_POSTCODES = 40
# (column name, dtype) pairs; _ensure_direct_epc_columns adds any that are
# missing and _direct_epc_match_schema builds its schema from them.
_DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
    ("_direct_epc_address", pl.Utf8),
    ("_direct_current_energy_rating", pl.Utf8),
    ("_direct_potential_energy_rating", pl.Utf8),
    ("_direct_total_floor_area", pl.Float64),
    ("_direct_number_habitable_rooms", pl.Int16),
    ("_direct_floor_height", pl.Float64),
    ("_direct_construction_age_band", pl.UInt16),
    ("_direct_is_construction_date_approximate", pl.UInt8),
    ("_direct_was_council_house", pl.Utf8),
    ("_direct_epc_match_status", pl.Utf8),
    ("_direct_epc_match_score", pl.Float32),
    ("_direct_epc_match_margin", pl.Float32),
)
# Raw EPC column name -> `_direct_*` column name.
_DIRECT_EPC_RAW_COLUMN_MAP = {
    "epc_address": "_direct_epc_address",
    "current_energy_rating": "_direct_current_energy_rating",
    "potential_energy_rating": "_direct_potential_energy_rating",
    "total_floor_area": "_direct_total_floor_area",
    "number_habitable_rooms": "_direct_number_habitable_rooms",
    "floor_height": "_direct_floor_height",
    "construction_age_band": "_direct_construction_age_band",
    "is_construction_date_approximate": "_direct_is_construction_date_approximate",
    "was_council_house": "_direct_was_council_house",
}
|
|
|
|
|
|
def _canonical_postcode_expr(column: str) -> pl.Expr:
    """Build an expression formatting a postcode as ``AB1 2CD``, or null when malformed.

    The value is upper-cased and stripped of every non-alphanumeric character.
    If the result has a valid postcode shape, a single space is inserted
    before the final digit-letter-letter group.
    """
    squeezed = pl.col(column).cast(pl.Utf8).str.to_uppercase()
    squeezed = squeezed.str.replace_all(r"[^A-Z0-9]+", "").str.strip_chars()
    looks_valid = squeezed.str.contains(r"^[A-Z]{1,2}\d[A-Z\d]?\d[A-Z]{2}$")
    spaced = squeezed.str.replace(r"^(.+)([0-9][A-Z]{2})$", "${1} ${2}")
    return pl.when(looks_valid).then(spaced).otherwise(None)
|
|
|
|
|
|
def _postcode_outcode_expr(column: str) -> pl.Expr:
    """Build an expression extracting the outcode from a normalised postcode key."""
    outcode_pattern = r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$"
    postcode_key = normalize_postcode_key(pl.col(column))
    return postcode_key.str.extract(outcode_pattern, 1)
|
|
|
|
|
|
def _canonical_epc_property_type_expr() -> pl.Expr:
    """Build an expression mapping EPC property type and built form to canonical labels.

    Houses with a usable ``built_form`` take the built form; other rows with
    an ``epc_property_type`` take that type; rows without one become null.
    The chosen value is then passed through a fixed relabelling table.
    """
    built_form = pl.col("built_form")
    epc_type = pl.col("epc_property_type")

    unusable_built_form = built_form.is_null() | built_form.is_in(
        ["NO DATA!", "Not Recorded"]
    )
    has_epc = epc_type.is_not_null()
    is_house = epc_type == "House"

    relabel = {
        "Flat": "Flats/Maisonettes",
        "Maisonette": "Flats/Maisonettes",
        "End-Terrace": "Terraced",
        "Mid-Terrace": "Terraced",
        "Enclosed End-Terrace": "Terraced",
        "Enclosed Mid-Terrace": "Terraced",
        "Bungalow": "Other",
        "Park home": "Other",
        "House": "Other",
    }
    chosen = (
        pl.when(has_epc & is_house & ~unusable_built_form)
        .then(built_form)
        .when(has_epc)
        .then(epc_type)
        .otherwise(None)
    )
    return chosen.replace(relabel)
|
|
|
|
|
|
def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
    """Build an expression extracting the first four-digit year of an age band as UInt16.

    The ``"England and Wales: "`` and ``" onwards"`` fragments are removed
    before extraction; values without a four-digit run become null.
    """
    band = pl.col(column).cast(pl.Utf8)
    for fragment in ("England and Wales: ", " onwards"):
        band = band.str.replace(fragment, "")
    first_year = band.str.extract(r"(\d{4})", 1)
    return first_year.cast(pl.UInt16, strict=False)
|
|
|
|
|
|
def _ratio_bonus(
|
|
left: float | int | None, right: float | int | None, pct: float, cap: float
|
|
) -> float:
|
|
if left is None or right is None:
|
|
return 0.0
|
|
try:
|
|
left_f = float(left)
|
|
right_f = float(right)
|
|
except (TypeError, ValueError):
|
|
return 0.0
|
|
if left_f <= 0 or right_f <= 0:
|
|
return 0.0
|
|
rel = abs(left_f - right_f) / max(left_f, right_f)
|
|
if rel > pct:
|
|
return 0.0
|
|
return cap * (1.0 - rel / pct)
|
|
|
|
|
|
def _rooms_bonus(left: int | None, right: int | None) -> float:
|
|
if left is None or right is None:
|
|
return 0.0
|
|
try:
|
|
diff = abs(int(left) - int(right))
|
|
except (TypeError, ValueError):
|
|
return 0.0
|
|
if diff == 0:
|
|
return 4.0
|
|
if diff == 1:
|
|
return 2.0
|
|
return 0.0
|
|
|
|
|
|
def _enum_bonus(
|
|
left: str | None, right: str | None, *, exact: float, mismatch: float
|
|
) -> float:
|
|
if not left or not right:
|
|
return 0.0
|
|
return exact if left == right else mismatch
|
|
|
|
|
|
def _address_score(query: str, candidate: str | None) -> int:
    """Return the better of the token-set and token-sort fuzzy scores, or 0 without a candidate."""
    if not candidate:
        return 0
    set_score = fuzz.token_set_ratio(query, candidate)
    sort_score = fuzz.token_sort_ratio(query, candidate)
    return max(set_score, sort_score)
|
|
|
|
|
|
def _has_number(address: str | None) -> bool:
    """Return True when ``address`` is non-empty and contains at least one digit run."""
    if not address:
        return False
    return _NUMBER_RE.search(address) is not None
|
|
|
|
|
|
def _load_listings_for_merge(
    listings_path: Path, arcgis_path: Path
) -> pl.DataFrame:
    """Load the listings parquet in the shape needed for the wide-frame merge.

    The result has one row per listing, identified by `_listing_idx`, with:

    * `postcode` — the first non-null of: the mapped `new_postcode` from the
      postcode mapping, the canonical form of the listing postcode, and the
      raw `Postcode`;
    * `pp_address` — the listing's `Address per Property Register` value;
    * one `_actual_*` overlay column per `_LISTING_OVERLAY_SOURCES` entry,
      cast non-strictly to the declared dtype.
    """
    listings = pl.scan_parquet(listings_path).with_row_index("_listing_idx")
    postcode_mapping = build_postcode_mapping(arcgis_path).lazy()

    # Listings parquets occasionally carry Float NaNs (e.g. floor area). Polars
    # treats NaN as distinct from null and the downstream `latest_price /
    # total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats
    # to null at load time.
    def _as_overlay(source: str, target: str, dtype: pl.DataType) -> pl.Expr:
        converted = pl.col(source).cast(dtype, strict=False)
        if dtype in (pl.Float32, pl.Float64):
            converted = converted.fill_nan(None)
        return converted.alias(target)

    overlay_exprs = []
    overlay_names = []
    for source, target, dtype in _LISTING_OVERLAY_SOURCES:
        overlay_exprs.append(_as_overlay(source, target, dtype))
        overlay_names.append(target)

    canonicalised = listings.with_columns(
        _canonical_postcode_expr("Postcode").alias("_canonical_postcode"),
    )
    remapped = canonicalised.join(
        postcode_mapping,
        left_on="_canonical_postcode",
        right_on="old_postcode",
        how="left",
    )
    prepared = remapped.with_columns(
        pl.coalesce("new_postcode", "_canonical_postcode", "Postcode").alias(
            "postcode"
        ),
        pl.col("Address per Property Register").alias("pp_address"),
        *overlay_exprs,
    )
    return prepared.select(
        "_listing_idx",
        "postcode",
        "pp_address",
        *overlay_names,
    ).collect(engine="streaming")
|
|
|
|
|
|
def _ensure_direct_epc_columns(df: pl.DataFrame) -> pl.DataFrame:
    """Add any absent ``_DIRECT_EPC_COLUMNS`` as typed all-null columns.

    Returns ``df`` itself when every column is already present.
    """
    present = set(df.columns)
    fillers = [
        pl.lit(None, dtype=dtype).alias(name)
        for name, dtype in _DIRECT_EPC_COLUMNS
        if name not in present
    ]
    return df.with_columns(fillers) if fillers else df
|
|
|
|
|
|
def _direct_epc_match_schema() -> dict[str, pl.DataType]:
    """Return the schema of a direct-EPC match frame: ``_listing_idx`` plus the direct EPC columns."""
    schema: dict[str, pl.DataType] = {"_listing_idx": pl.UInt32}
    schema.update(_DIRECT_EPC_COLUMNS)
    return schema
|
|
|
|
|
|
def _empty_direct_epc_matches() -> pl.DataFrame:
    """Build a zero-row frame that carries the direct-EPC match schema."""
    match_schema = _direct_epc_match_schema()
    return pl.DataFrame(schema=match_schema)
|
|
|
|
|
|
def _load_direct_epc_candidates(
    epc_path: Path,
    arcgis_path: Path,
    listing_outcodes: list[str],
    temp_dir: Path,
) -> pl.DataFrame:
    """Load the EPC certificates that listings can be matched against directly.

    Certificates are scanned with `_scan_epc_certificates`, keyed by their
    normalised address and postcode, restricted to `listing_outcodes`, and
    reduced to one row per (address, postcode) key: the certificate with the
    most recent `inspection_date`. Each surviving row is joined to the
    `east1m` / `north1m` values of its postcode from `arcgis_path` and given
    the `_direct_*` columns that are later copied onto a matched listing.

    Args:
        epc_path: EPC certificate source, passed to `_scan_epc_certificates`.
        arcgis_path: Parquet file providing `pcds`, `east1m` and `north1m`.
        listing_outcodes: Outcodes to keep. An empty list short-circuits to an
            empty frame without scanning any certificate.
        temp_dir: Scratch directory passed to `_scan_epc_certificates`.

    Returns:
        One row per (normalised address, normalised postcode), numbered by
        `_direct_epc_row`.
    """
    schema = {
        "_direct_epc_row": pl.UInt32,
        "_direct_epc_match_address": pl.Utf8,
        "_direct_epc_match_postcode": pl.Utf8,
        "_direct_epc_outcode": pl.Utf8,
        "_direct_epc_canonical_property_type": pl.Utf8,
        "_direct_epc_east": pl.Float64,
        "_direct_epc_north": pl.Float64,
        **{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
    }
    if not listing_outcodes:
        return pl.DataFrame(schema=schema)

    epc_base = (
        _scan_epc_certificates(epc_path, temp_dir)
        .with_columns(
            normalize_address_key(pl.col("epc_address")).alias(
                "_direct_epc_match_address"
            ),
            normalize_postcode_key(pl.col("epc_postcode")).alias(
                "_direct_epc_match_postcode"
            ),
        )
        .with_columns(
            # Outcode = everything before the trailing digit + two letters.
            pl.col("_direct_epc_match_postcode")
            .str.extract(r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$", 1)
            .alias("_direct_epc_outcode")
        )
        .filter(pl.col("_direct_epc_outcode").is_in(listing_outcodes))
        .filter(pl.col("_direct_epc_match_address").is_not_null())
        .filter(pl.col("_direct_epc_match_postcode").is_not_null())
    )

    # A key is flagged when ANY of its certificates (not only the latest one)
    # has a tenure containing "social", case-insensitively.
    social_tenure = (
        epc_base.filter(pl.col("tenure").str.to_lowercase().str.contains("social"))
        .select("_direct_epc_match_address", "_direct_epc_match_postcode")
        .unique()
        .with_columns(pl.lit("Yes").alias("_direct_was_council_house"))
    )

    arcgis = pl.scan_parquet(arcgis_path).select(
        normalize_postcode_key(pl.col("pcds")).alias("_direct_epc_match_postcode"),
        pl.col("east1m").alias("_direct_epc_east"),
        pl.col("north1m").alias("_direct_epc_north"),
    )

    return (
        # nulls_last=True: Polars puts nulls first by default, also for a
        # descending sort, which would let an undated certificate shadow the
        # dated ones when `.first()` picks the "latest" row of each group.
        epc_base.sort("inspection_date", descending=True, nulls_last=True)
        .group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
        .first()
        .join(
            social_tenure,
            on=["_direct_epc_match_address", "_direct_epc_match_postcode"],
            how="left",
        )
        .join(arcgis, on="_direct_epc_match_postcode", how="left")
        .with_columns(
            _canonical_epc_property_type_expr().alias(
                "_direct_epc_canonical_property_type"
            ),
            _construction_year_expr().alias("_direct_construction_age_band"),
            # Ratings outside `_EPC_RATING_VALUES` are nulled rather than kept.
            pl.when(pl.col("current_energy_rating").is_in(_EPC_RATING_VALUES))
            .then(pl.col("current_energy_rating"))
            .otherwise(None)
            .alias("_direct_current_energy_rating"),
            pl.when(pl.col("potential_energy_rating").is_in(_EPC_RATING_VALUES))
            .then(pl.col("potential_energy_rating"))
            .otherwise(None)
            .alias("_direct_potential_energy_rating"),
            pl.col("epc_address").alias("_direct_epc_address"),
            pl.col("total_floor_area").alias("_direct_total_floor_area"),
            pl.col("number_habitable_rooms").alias(
                "_direct_number_habitable_rooms"
            ),
            pl.col("floor_height").alias("_direct_floor_height"),
            # Keys absent from `social_tenure` come out of the left join null.
            pl.col("_direct_was_council_house").fill_null("No"),
        )
        .with_columns(
            # Flag is 1 whenever an age band was derived, null otherwise.
            pl.when(pl.col("_direct_construction_age_band").is_not_null())
            .then(pl.lit(1, dtype=pl.UInt8))
            .otherwise(pl.lit(None, dtype=pl.UInt8))
            .alias("_direct_is_construction_date_approximate")
        )
        .with_row_index("_direct_epc_row")
        .select(
            "_direct_epc_row",
            "_direct_epc_match_address",
            "_direct_epc_match_postcode",
            "_direct_epc_outcode",
            "_direct_epc_canonical_property_type",
            "_direct_epc_east",
            "_direct_epc_north",
            "_direct_epc_address",
            "_direct_current_energy_rating",
            "_direct_potential_energy_rating",
            "_direct_total_floor_area",
            "_direct_number_habitable_rooms",
            "_direct_floor_height",
            "_direct_construction_age_band",
            "_direct_is_construction_date_approximate",
            "_direct_was_council_house",
        )
        .collect(engine="streaming")
    )
|
|
|
|
|
|
def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
    """Add the matching keys and projected coordinates used to match listings.

    Adds `_listing_match_address` / `_listing_match_postcode` (normalised
    keys), `_listing_outcode` (extracted from the normalised postcode) and
    `_listing_east` / `_listing_north`, obtained by transforming
    `_actual_lon` / `_actual_lat` from EPSG:4326 to EPSG:27700.
    """
    outcode_pattern = r"^([A-Z]{1,2}\d[A-Z\d]?)\d[A-Z]{2}$"
    keyed = listings.with_columns(
        normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"),
        normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"),
    )
    keyed = keyed.with_columns(
        pl.col("_listing_match_postcode")
        .str.extract(outcode_pattern, 1)
        .alias("_listing_outcode")
    )

    if keyed.is_empty():
        # Nothing to project: attach empty, correctly typed coordinate columns
        # without building a transformer.
        eastings = []
        northings = []
    else:
        to_projected = Transformer.from_crs(
            "EPSG:4326", "EPSG:27700", always_xy=True
        )
        # always_xy=True means the argument order is (lon, lat).
        eastings, northings = to_projected.transform(
            keyed["_actual_lon"].to_numpy(), keyed["_actual_lat"].to_numpy()
        )

    return keyed.with_columns(
        pl.Series("_listing_east", eastings, dtype=pl.Float64),
        pl.Series("_listing_north", northings, dtype=pl.Float64),
    )
|
|
|
|
|
|
def _optional_lazy_col(
    schema: pl.Schema, column: str, dtype: pl.DataType
) -> pl.Expr:
    """Expression for `column` that tolerates the column being absent.

    When `column` is in `schema` it is cast (non-strictly) to `dtype`;
    otherwise a null literal of `dtype` is used. Either way the result is
    named `column`.
    """
    source = (
        pl.col(column).cast(dtype, strict=False)
        if column in schema
        else pl.lit(None, dtype=dtype)
    )
    return source.alias(column)
|
|
|
|
|
|
def _listing_property_match_schema() -> dict[str, pl.DataType]:
    """Column names and dtypes of a listing-to-property match frame."""
    columns = (
        ("_listing_idx", pl.UInt32),
        ("_matched_postcode", pl.Utf8),
        ("_matched_pp_address", pl.Utf8),
        ("_property_match_score", pl.Float32),
        ("_property_match_address_score", pl.Int32),
        ("_property_match_margin", pl.Float32),
        ("_property_match_field", pl.Utf8),
    )
    return dict(columns)
|
|
|
|
|
|
def _empty_listing_property_matches() -> pl.DataFrame:
    """Return a zero-row frame that carries the listing-property match schema."""
    match_schema = _listing_property_match_schema()
    return pl.DataFrame(schema=match_schema)
|
|
|
|
|
|
def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
    """Collect the slice of `wide` that listings are fuzzy-matched against.

    Keeps `postcode` and `pp_address` plus the optional comparison fields
    (null-filled when `wide` lacks them), numbers the rows with
    `_property_row`, and adds normalised postcode / address keys. Rows are
    dropped when `pp_address` or the normalised postcode is null, or when
    neither normalised address key is available.
    """
    wide_schema = wide.collect_schema()

    projected = wide.select(
        pl.col("postcode").cast(pl.Utf8).alias("postcode"),
        pl.col("pp_address").cast(pl.Utf8).alias("pp_address"),
        _optional_lazy_col(wide_schema, "epc_address", pl.Utf8),
        _optional_lazy_col(wide_schema, "pp_property_type", pl.Utf8),
        _optional_lazy_col(wide_schema, "duration", pl.Utf8),
        _optional_lazy_col(wide_schema, "total_floor_area", pl.Float64),
        _optional_lazy_col(wide_schema, "number_habitable_rooms", pl.Int16),
        _optional_lazy_col(wide_schema, "latest_price", pl.Int64),
    ).with_row_index("_property_row")

    keyed = projected.with_columns(
        normalize_postcode_key(pl.col("postcode")).alias("_property_match_postcode"),
        normalize_address_key(pl.col("pp_address")).alias("_property_match_address"),
        normalize_address_key(pl.col("epc_address")).alias(
            "_property_epc_match_address"
        ),
    )

    has_any_address_key = (
        pl.col("_property_match_address").is_not_null()
        | pl.col("_property_epc_match_address").is_not_null()
    )
    usable = keyed.filter(
        pl.col("pp_address").is_not_null()
        & pl.col("_property_match_postcode").is_not_null()
        & has_any_address_key
    )
    return usable.collect(engine="streaming")
|
|
|
|
|
|
def _property_candidates_by_postcode(
    candidates: pl.DataFrame,
) -> dict[str, list[dict]]:
    """Group candidate rows (as dicts) by `_property_match_postcode`.

    Rows whose key is null or empty are left out; rows within a bucket keep
    the frame's row order.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_property_match_postcode")
        if not key:
            continue
        if key in grouped:
            grouped[key].append(record)
        else:
            grouped[key] = [record]
    return grouped
|
|
|
|
|
|
def _best_listing_property_candidate(
|
|
listing: dict, candidates: list[dict]
|
|
) -> dict | None:
|
|
query = listing.get("_listing_match_address")
|
|
if not query:
|
|
return None
|
|
|
|
listing_has_numbers = _has_number(query)
|
|
scored: list[tuple[float, int, dict, str]] = []
|
|
for candidate in candidates:
|
|
register_address = candidate.get("_property_match_address")
|
|
epc_address = candidate.get("_property_epc_match_address")
|
|
register_numbers_compatible = bool(
|
|
register_address and _numbers_compatible(query, register_address)
|
|
)
|
|
epc_numbers_compatible = bool(
|
|
epc_address and _numbers_compatible(query, epc_address)
|
|
)
|
|
if not (register_numbers_compatible or epc_numbers_compatible):
|
|
continue
|
|
|
|
register_score = _address_score(query, register_address)
|
|
epc_score = _address_score(query, epc_address)
|
|
base_score = max(register_score, epc_score)
|
|
if base_score == 0:
|
|
continue
|
|
|
|
score = float(base_score)
|
|
score += _enum_bonus(
|
|
listing.get("_actual_property_type"),
|
|
candidate.get("pp_property_type"),
|
|
exact=7.0,
|
|
mismatch=-8.0,
|
|
)
|
|
score += _enum_bonus(
|
|
listing.get("_actual_leasehold_freehold"),
|
|
candidate.get("duration"),
|
|
exact=3.0,
|
|
mismatch=-3.0,
|
|
)
|
|
score += _ratio_bonus(
|
|
listing.get("_actual_total_floor_area"),
|
|
candidate.get("total_floor_area"),
|
|
pct=0.15,
|
|
cap=8.0,
|
|
)
|
|
score += _rooms_bonus(
|
|
listing.get("_actual_number_habitable_rooms"),
|
|
candidate.get("number_habitable_rooms"),
|
|
)
|
|
score += _ratio_bonus(
|
|
listing.get("_actual_asking_price"),
|
|
candidate.get("latest_price"),
|
|
pct=0.25,
|
|
cap=3.0,
|
|
)
|
|
matched_field = (
|
|
"pp_address" if register_score >= epc_score else "epc_address"
|
|
)
|
|
scored.append((score, base_score, candidate, matched_field))
|
|
|
|
if not scored:
|
|
return None
|
|
scored.sort(key=lambda item: item[0], reverse=True)
|
|
top = scored[0]
|
|
runner_up = scored[1][0] if len(scored) > 1 else None
|
|
margin = top[0] - runner_up if runner_up is not None else top[0]
|
|
score_threshold = (
|
|
_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
|
|
if listing_has_numbers
|
|
else _PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
|
|
)
|
|
address_threshold = (
|
|
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS
|
|
if listing_has_numbers
|
|
else _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS
|
|
)
|
|
if (
|
|
top[0] < score_threshold
|
|
or top[1] < address_threshold
|
|
or margin < _PROPERTY_MATCH_MIN_MARGIN
|
|
):
|
|
return None
|
|
|
|
candidate = top[2]
|
|
return {
|
|
"_listing_idx": listing["_listing_idx"],
|
|
"_matched_postcode": candidate.get("postcode"),
|
|
"_matched_pp_address": candidate.get("pp_address"),
|
|
"_property_match_score": round(top[0], 1),
|
|
"_property_match_address_score": top[1],
|
|
"_property_match_margin": round(margin, 1),
|
|
"_property_match_field": top[3],
|
|
}
|
|
|
|
|
|
def _match_listing_properties(
    listing_matches: pl.DataFrame, property_candidates: pl.DataFrame
) -> pl.DataFrame:
    """Match each listing to at most one property row in its own postcode.

    Every listing with a normalised postcode is scored against the candidates
    sharing that postcode. When several listings pick the same
    (postcode, pp_address), only the highest-scoring one is kept (lowest
    `_listing_idx` on ties). The result is ordered by `_listing_idx`.
    """
    if listing_matches.is_empty() or property_candidates.is_empty():
        return _empty_listing_property_matches()

    by_postcode = _property_candidates_by_postcode(property_candidates)
    found = []
    for row in listing_matches.iter_rows(named=True):
        key = row.get("_listing_match_postcode")
        if key:
            best = _best_listing_property_candidate(row, by_postcode.get(key, []))
            if best is not None:
                found.append(best)

    if not found:
        return _empty_listing_property_matches()

    frame = pl.DataFrame(found, schema=_listing_property_match_schema())
    strongest_first = frame.sort(
        ["_property_match_score", "_listing_idx"], descending=[True, False]
    )
    # keep="first" on the sorted frame retains the strongest listing for each
    # property; maintain_order makes that choice deterministic.
    one_listing_per_property = strongest_first.unique(
        ["_matched_postcode", "_matched_pp_address"],
        keep="first",
        maintain_order=True,
    )
    return one_listing_per_property.sort("_listing_idx")
|
|
|
|
|
|
def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
    """Group EPC candidate rows (as dicts) by `_direct_epc_match_postcode`.

    Rows whose key is null or empty are left out; rows within a bucket keep
    the frame's row order.
    """
    grouped: dict[str, list[dict]] = {}
    for record in candidates.iter_rows(named=True):
        key = record.get("_direct_epc_match_postcode")
        if not key:
            continue
        if key in grouped:
            grouped[key].append(record)
        else:
            grouped[key] = [record]
    return grouped
|
|
|
|
|
|
def _epc_postcode_tree(
    candidates: pl.DataFrame,
) -> tuple[cKDTree | None, list[str]]:
    """Build a KD-tree over the coordinates of the candidates' postcodes.

    Returns:
        `(tree, postcodes)` where `postcodes[i]` is the postcode of the tree's
        i-th point, with one point per distinct postcode. `(None, [])` when no
        postcode has non-null, finite east/north values.
    """
    located = candidates.select(
        "_direct_epc_match_postcode",
        "_direct_epc_east",
        "_direct_epc_north",
    ).drop_nulls()
    located = located.filter(
        pl.col("_direct_epc_east").is_finite()
        & pl.col("_direct_epc_north").is_finite()
    )
    per_postcode = located.unique("_direct_epc_match_postcode")

    if per_postcode.is_empty():
        return None, []

    eastings = per_postcode["_direct_epc_east"].to_numpy()
    northings = per_postcode["_direct_epc_north"].to_numpy()
    tree = cKDTree(np.column_stack((eastings, northings)))
    return tree, per_postcode["_direct_epc_match_postcode"].to_list()
|
|
|
|
|
|
def _candidate_postcodes_for_listing(
|
|
listing: dict,
|
|
postcode_tree: cKDTree | None,
|
|
postcode_values: list[str],
|
|
) -> list[str]:
|
|
postcodes: list[str] = []
|
|
exact = listing.get("_listing_match_postcode")
|
|
if exact:
|
|
postcodes.append(exact)
|
|
|
|
if postcode_tree is None:
|
|
return postcodes
|
|
|
|
east = listing.get("_listing_east")
|
|
north = listing.get("_listing_north")
|
|
try:
|
|
east_f = float(east)
|
|
north_f = float(north)
|
|
except (TypeError, ValueError):
|
|
return postcodes
|
|
if not np.isfinite(east_f) or not np.isfinite(north_f):
|
|
return postcodes
|
|
|
|
k = min(_DIRECT_EPC_NEAREST_POSTCODES, len(postcode_values))
|
|
distances, indices = postcode_tree.query(
|
|
[east_f, north_f],
|
|
k=k,
|
|
distance_upper_bound=_DIRECT_EPC_NEARBY_RADIUS_M,
|
|
)
|
|
distances = np.atleast_1d(distances)
|
|
indices = np.atleast_1d(indices)
|
|
seen = set(postcodes)
|
|
for distance, idx in zip(distances, indices, strict=False):
|
|
if not np.isfinite(distance) or idx >= len(postcode_values):
|
|
continue
|
|
postcode = postcode_values[int(idx)]
|
|
if postcode not in seen:
|
|
postcodes.append(postcode)
|
|
seen.add(postcode)
|
|
return postcodes
|
|
|
|
|
|
def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
|
|
query = listing.get("_listing_match_address")
|
|
if not query:
|
|
return None
|
|
|
|
listing_has_numbers = _has_number(query)
|
|
scored: list[tuple[float, int, dict]] = []
|
|
for candidate in candidates:
|
|
address = candidate.get("_direct_epc_match_address")
|
|
if listing_has_numbers and not _numbers_compatible(query, address or ""):
|
|
continue
|
|
base_score = _address_score(query, address)
|
|
if base_score == 0:
|
|
continue
|
|
|
|
score = float(base_score)
|
|
score += _enum_bonus(
|
|
listing.get("_actual_property_type"),
|
|
candidate.get("_direct_epc_canonical_property_type"),
|
|
exact=6.0,
|
|
mismatch=-6.0,
|
|
)
|
|
score += _ratio_bonus(
|
|
listing.get("_actual_total_floor_area"),
|
|
candidate.get("_direct_total_floor_area"),
|
|
pct=0.12,
|
|
cap=8.0,
|
|
)
|
|
score += _rooms_bonus(
|
|
listing.get("_actual_number_habitable_rooms"),
|
|
candidate.get("_direct_number_habitable_rooms"),
|
|
)
|
|
scored.append((score, base_score, candidate))
|
|
|
|
if not scored:
|
|
return None
|
|
scored.sort(key=lambda item: item[0], reverse=True)
|
|
top = scored[0]
|
|
runner_up = scored[1][0] if len(scored) > 1 else None
|
|
margin = top[0] - runner_up if runner_up is not None else top[0]
|
|
threshold = (
|
|
_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS
|
|
if listing_has_numbers
|
|
else _DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
|
|
)
|
|
if top[0] < threshold or margin < _DIRECT_EPC_MATCH_MIN_MARGIN:
|
|
return None
|
|
|
|
candidate = top[2]
|
|
return {
|
|
"_listing_idx": listing["_listing_idx"],
|
|
"_direct_epc_address": candidate.get("_direct_epc_address"),
|
|
"_direct_current_energy_rating": candidate.get("_direct_current_energy_rating"),
|
|
"_direct_potential_energy_rating": candidate.get(
|
|
"_direct_potential_energy_rating"
|
|
),
|
|
"_direct_total_floor_area": candidate.get("_direct_total_floor_area"),
|
|
"_direct_number_habitable_rooms": candidate.get(
|
|
"_direct_number_habitable_rooms"
|
|
),
|
|
"_direct_floor_height": candidate.get("_direct_floor_height"),
|
|
"_direct_construction_age_band": candidate.get("_direct_construction_age_band"),
|
|
"_direct_is_construction_date_approximate": candidate.get(
|
|
"_direct_is_construction_date_approximate"
|
|
),
|
|
"_direct_was_council_house": candidate.get("_direct_was_council_house"),
|
|
"_direct_epc_match_status": "matched",
|
|
"_direct_epc_match_score": round(top[0], 1),
|
|
"_direct_epc_match_margin": round(margin, 1),
|
|
}
|
|
|
|
|
|
def _match_direct_epc(
    listing_matches: pl.DataFrame, epc_candidates: pl.DataFrame
) -> pl.DataFrame:
    """Match each listing to its best EPC certificate, if any.

    For every listing the candidate pool is the union of the EPC rows in the
    postcodes returned by `_candidate_postcodes_for_listing`, de-duplicated
    on `_direct_epc_row`. Listings without a qualifying certificate are
    simply absent from the result.
    """
    if listing_matches.is_empty() or epc_candidates.is_empty():
        return _empty_direct_epc_matches()

    by_postcode = _epc_candidates_by_postcode(epc_candidates)
    tree, tree_postcodes = _epc_postcode_tree(epc_candidates)

    results = []
    for listing in listing_matches.iter_rows(named=True):
        pool: list[dict] = []
        used_ids: set[int] = set()
        for postcode in _candidate_postcodes_for_listing(
            listing, tree, tree_postcodes
        ):
            for epc_row in by_postcode.get(postcode, []):
                row_id = epc_row.get("_direct_epc_row")
                if row_id in used_ids:
                    continue
                pool.append(epc_row)
                # Rows without an id cannot be de-duplicated; always kept.
                if row_id is not None:
                    used_ids.add(row_id)

        best = _best_direct_epc_candidate(listing, pool)
        if best is not None:
            results.append(best)

    if results:
        return pl.DataFrame(results, schema=_direct_epc_match_schema())
    return _empty_direct_epc_matches()
|
|
|
|
|
|
def _enrich_listings_with_direct_epc(
    listings: pl.DataFrame,
    epc_path: Path | None,
    arcgis_path: Path,
) -> pl.DataFrame:
    """Attach directly matched EPC values to `listings`.

    The returned frame always contains every `_DIRECT_EPC_COLUMNS` column.
    They stay null when `epc_path` is None, when no listing yields an
    outcode, or when a listing has no qualifying certificate.
    """
    if epc_path is None:
        return _ensure_direct_epc_columns(listings)

    listing_matches = _listing_match_frame(listings)
    listing_outcodes = (
        listing_matches["_listing_outcode"].drop_nulls().unique().to_list()
    )
    if not listing_outcodes:
        return _ensure_direct_epc_columns(listings)

    # The scratch directory is needed only while candidates are loaded and
    # matched; it is removed when the `with` block exits.
    with tempfile.TemporaryDirectory(
        prefix="direct_listing_epc_", dir=local_tmp_dir()
    ) as tmpdir:
        epc_candidates = _load_direct_epc_candidates(
            epc_path, arcgis_path, listing_outcodes, Path(tmpdir)
        )
        print(f"Direct listing EPC candidates: {epc_candidates.height}")
        direct_matches = _match_direct_epc(listing_matches, epc_candidates)

    print(f"Direct listing EPC matches: {direct_matches.height}")
    if direct_matches.is_empty():
        enriched = listings
    else:
        enriched = listings.join(direct_matches, on="_listing_idx", how="left")
    return _ensure_direct_epc_columns(enriched)
|
|
|
|
|
|
def _coalesce_direct_epc_columns(wide: pl.LazyFrame) -> pl.LazyFrame:
    """Fill nulls in the raw EPC columns from their `_direct_*` counterparts.

    For each (raw, direct) pair of `_DIRECT_EPC_RAW_COLUMN_MAP` the raw value
    wins when present; the direct value is used only where the raw one is
    null.
    """
    fallbacks: list[pl.Expr] = []
    for raw_name, direct_name in _DIRECT_EPC_RAW_COLUMN_MAP.items():
        filled = pl.coalesce(pl.col(raw_name), pl.col(direct_name))
        fallbacks.append(filled.alias(raw_name))
    return wide.with_columns(fallbacks)
|
|
|
|
|
|
def _build_unmatched_listing_seed_rows(
    unmatched_listing_idxs: pl.DataFrame,
    listings: pl.DataFrame,
    template_schema: pl.Schema,
) -> pl.DataFrame:
    """Build wide-shaped rows for listings that matched no existing property.

    Each seed row has exactly the columns and dtypes of `template_schema`.
    Columns with a known listing source (postcode, address, property type,
    tenure, floor area, room count, asking price as `latest_price`, the
    direct-EPC values and the `_actual_*` overlay columns) are filled from
    the listing; every other column is null.
    """
    if unmatched_listing_idxs.is_empty():
        return pl.DataFrame(schema=template_schema)

    listings = _ensure_direct_epc_columns(listings)
    unmatched = unmatched_listing_idxs.join(listings, on="_listing_idx", how="inner")

    # Listing-provided values win over directly matched EPC values.
    sources: dict[str, pl.Expr] = {
        "postcode": pl.col("postcode"),
        "pp_address": pl.col("pp_address"),
        "pp_property_type": pl.col("_actual_property_type"),
        "duration": pl.col("_actual_leasehold_freehold"),
        "total_floor_area": pl.coalesce(
            pl.col("_actual_total_floor_area"), pl.col("_direct_total_floor_area")
        ),
        "number_habitable_rooms": pl.coalesce(
            pl.col("_actual_number_habitable_rooms"),
            pl.col("_direct_number_habitable_rooms"),
        ),
        "latest_price": pl.col("_actual_asking_price"),
    }
    # Remaining raw EPC columns come straight from the direct match, without
    # overriding the entries above.
    for raw_name, direct_name in _DIRECT_EPC_RAW_COLUMN_MAP.items():
        sources.setdefault(raw_name, pl.col(direct_name))
    # Overlay columns pass through under their own name.
    for _src, overlay_name, _dt in _LISTING_OVERLAY_SOURCES:
        sources[overlay_name] = pl.col(overlay_name)

    projection = [
        (
            sources[name].cast(dtype, strict=False)
            if name in sources
            else pl.lit(None, dtype=dtype)
        ).alias(name)
        for name, dtype in template_schema.items()
    ]
    return unmatched.select(projection)
|
|
|
|
|
|
def _integrate_listings(
    wide: pl.LazyFrame,
    listings_path: Path,
    arcgis_path: Path,
    epc_path: Path | None = None,
) -> pl.LazyFrame:
    """Merge actual listings into the wide property frame.

    Listings are fuzzy-matched to wide rows within the same postcode. A
    matched listing's `_actual_*` overlay columns are joined onto the
    existing wide row on (postcode, pp_address), and its directly matched EPC
    values fill nulls in the row's raw EPC columns. Listings without a match
    are appended as new rows built by `_build_unmatched_listing_seed_rows`.
    The `_DIRECT_EPC_COLUMNS` helper columns are dropped from the output.
    """
    listings = _load_listings_for_merge(listings_path, arcgis_path)
    print(f"Listings loaded: {listings.height}")
    listings = _enrich_listings_with_direct_epc(listings, epc_path, arcgis_path)

    direct_epc_names = [name for name, _dtype in _DIRECT_EPC_COLUMNS]
    attachment_names = [
        dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES
    ] + direct_epc_names

    property_candidates = _property_match_candidate_frame(wide)
    listing_keys = _listing_match_frame(listings)
    matches = _match_listing_properties(listing_keys, property_candidates)

    total = listings.height
    matched_count = matches.height
    if total > 0:
        print(
            "Listings matched to existing wide rows: "
            f"{matched_count}/{total} "
            f"({100 * matched_count / total:.1f}%)"
        )

    # Re-key the matched listings by the wide row they matched so that their
    # attachment columns can be joined onto `wide`.
    matched_listings = matches.join(listings, on="_listing_idx", how="inner")
    overlay_by_property = matched_listings.select(
        pl.col("_matched_postcode").alias("postcode"),
        pl.col("_matched_pp_address").alias("pp_address"),
        *attachment_names,
    ).unique(["postcode", "pp_address"], keep="first")

    with_overlay = wide.join(
        overlay_by_property.lazy(), on=["postcode", "pp_address"], how="left"
    )
    with_overlay = _coalesce_direct_epc_columns(with_overlay)
    wide_output = with_overlay.drop(direct_epc_names, strict=False)

    unmatched_idxs = listings.select("_listing_idx").join(
        matches.select("_listing_idx"), on="_listing_idx", how="anti"
    )
    seed_rows = _build_unmatched_listing_seed_rows(
        unmatched_idxs,
        listings,
        template_schema=wide_output.collect_schema(),
    )

    return pl.concat([wide_output, seed_rows.lazy()], how="vertical_relaxed")
|
|
|
|
|
|
def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
    """Reduce the post-rename wide frame to enriched-listing rows.

    Keeps only rows whose `_LISTING_FLAG_COLUMN` is non-null, publishes the
    `_actual_*` overlay values under their display names, lets listing values
    take precedence over the historical ones, derives price-per-sqm figures
    and a historical match status, then drops the overlay columns.
    """
    df = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null())

    # Overlay columns copied as-is under a display name.
    copied = [
        pl.col(source).alias(label)
        for source, label in (
            ("_actual_listing_url", "Listing URL"),
            ("_actual_listing_date", "Listing date"),
            ("_actual_listing_status", "Listing status"),
            ("_actual_listing_features", "Listing features"),
            ("_actual_asking_price", "Asking price"),
            ("_actual_asking_price_per_sqm", "Asking price per sqm"),
            ("_actual_bedrooms", "Bedrooms"),
            ("_actual_bathrooms", "Bathrooms"),
            ("_actual_price_qualifier", "Price qualifier"),
            ("_actual_property_sub_type", "Property sub-type"),
        )
    ]
    # Listing value wins when present (e.g. listing coordinates over the
    # postcode centroid, listing floor area / rooms over EPC/PP values).
    listing_first = [
        pl.coalesce(pl.col(source).cast(dtype), pl.col(label)).alias(label)
        for source, label, dtype in (
            ("_actual_lat", "lat", pl.Float64),
            ("_actual_lon", "lon", pl.Float64),
            ("_actual_total_floor_area", "Total floor area (sqm)", pl.Float64),
            (
                "_actual_number_habitable_rooms",
                "Number of bedrooms & living rooms",
                pl.Int16,
            ),
        )
    ]
    # Listing value wins only when it is one of the allowed values.
    validated = [
        pl.when(pl.col(source).is_in(allowed))
        .then(pl.col(source))
        .otherwise(pl.col(label))
        .alias(label)
        for source, label, allowed in (
            ("_actual_property_type", "Property type", _PROPERTY_TYPE_VALUES),
            ("_actual_leasehold_freehold", "Leasehold/Freehold", _TENURE_VALUES),
        )
    ]
    df = df.with_columns(*copied, *listing_first, *validated)

    # Derive the per-sqm price only where the listing did not supply one and
    # the floor area exceeds MIN_FLOOR_AREA_M2.
    price = pl.col("Asking price")
    area = pl.col("Total floor area (sqm)")
    derivable = price.is_not_null() & area.is_not_null() & (area > MIN_FLOOR_AREA_M2)
    derived_per_sqm = (
        (price.cast(pl.Float64) / area).round(0).cast(pl.Int32, strict=False)
    )
    df = df.with_columns(
        pl.coalesce(
            pl.col("Asking price per sqm"),
            pl.when(derivable).then(derived_per_sqm).otherwise(None),
        ).alias("Asking price per sqm")
    )

    has_history = pl.col("Date of last transaction").is_not_null()
    df = df.with_columns(
        pl.col("Asking price").alias("Estimated current price"),
        pl.col("Asking price per sqm").alias("Est. price per sqm"),
        pl.coalesce(pl.col("Last known price"), pl.col("Asking price")).alias(
            "Last known price"
        ),
        pl.when(has_history)
        .then(pl.lit("matched"))
        .otherwise(pl.lit("unmatched"))
        .alias("Historical property match status"),
    )

    overlay_names = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES]
    return df.drop(overlay_names, strict=False)
|
|
|
|
|
|
@dataclass
class _BuildResult:
    """Outputs of `_build`.

    Every slot defaults to None. Per `_build`'s docstring, `normal` mode
    produces the `postcode` and `properties` frames, while `listings` mode
    produces only `listings`.
    """

    # Postcode-level frame (normal mode).
    postcode: pl.DataFrame | None = None
    # Property-level frame (normal mode).
    properties: pl.DataFrame | None = None
    # Enriched listings frame (listings mode).
    listings: pl.DataFrame | None = None
|
|
|
|
|
|
def _build(
    epc_pp_path: Path,
    arcgis_path: Path,
    iod_path: Path,
    poi_proximity_path: Path,
    ethnicity_path: Path,
    crime_path: Path,
    noise_path: Path,
    school_proximity_path: Path,
    broadband_path: Path,
    conservation_areas_path: Path,
    rental_prices_path: Path,
    lsoa_population_path: Path,
    median_age_path: Path,
    election_results_path: Path,
    tree_density_postcodes_path: Path | None = None,
    listed_buildings_path: Path | None = None,
    actual_listings_path: Path | None = None,
    actual_listings_epc_path: Path | None = None,
    mode: Literal["normal", "listings"] = "normal",
) -> _BuildResult:
    """Build postcode/properties dataframes (or enriched listings) from epc_pp + auxiliary data.

    The epc_pp rows are lazily left-joined against every auxiliary source
    (postcode lookup, deprivation, ethnicity, rental prices, crime,
    population, median age, election results, POI/school proximity, noise,
    conservation areas, optional tree density, broadband), then renamed to
    the display column names and collected with the streaming engine.

    Modes:
      * `normal` — produces (postcode_df, properties_df). `actual_listings_path`
        and `actual_listings_epc_path` are ignored if supplied, so scraped
        listings never end up in the postcode/properties outputs.
      * `listings` — requires `actual_listings_path`; produces a single
        enriched-listings DataFrame and skips the postcode/properties outputs.
        Listings flow through the same enrichment joins as historical rows,
        so postcode-scoped features (tree density, crime, deprivation, …) end
        up populated on every listing with a valid postcode.

    Optional inputs:
      * `tree_density_postcodes_path` — when None the tree-density join is
        skipped entirely.
      * `listed_buildings_path` — when None the listed-building column is
        created empty and later filled with "No".

    Returns:
        A `_BuildResult` with `postcode` and `properties` set (normal mode)
        or with only `listings` set (listings mode).

    Raises:
        ValueError: if `mode` is "listings" and `actual_listings_path` is None.
    """
    if mode == "listings" and actual_listings_path is None:
        raise ValueError("listings mode requires actual_listings_path")
    _validate_lad_source_coverage(iod_path, ethnicity_path, rental_prices_path)

    # Keep rows with an unknown floor area, drop rows whose recorded floor
    # area is not above the plausibility threshold.
    wide = pl.scan_parquet(epc_pp_path).filter(
        pl.col("total_floor_area").is_null()
        | (pl.col("total_floor_area") > MIN_FLOOR_AREA_M2)
    )

    # Remap terminated postcodes to nearest active successor
    postcode_mapping = build_postcode_mapping(arcgis_path)
    wide = (
        wide.join(
            postcode_mapping.lazy(),
            left_on="postcode",
            right_on="old_postcode",
            how="left",
        )
        .with_columns(
            pl.coalesce("new_postcode", "postcode").alias("postcode"),
        )
        .drop("new_postcode")
    )

    arcgis_raw = pl.scan_parquet(arcgis_path)
    # Country code is attached for every postcode in the lookup (not only the
    # active English ones) so it is available even where the later
    # coordinate join finds no match.
    postcode_country = arcgis_raw.select(
        pl.col("pcds").alias("postcode"),
        pl.col("ctry25cd"),
    ).unique(["postcode"])
    wide = wide.join(postcode_country, on="postcode", how="left")

    # England only, active (non-terminated) postcodes only. Shared by the
    # listed-building matcher and the main postcode join below.
    active_england = arcgis_raw.filter(pl.col("ctry25cd") == "E92000001").filter(
        pl.col("doterm").is_null()
    )

    if listed_buildings_path is not None:
        active_postcodes_for_listed = active_england.select(
            pl.col("pcds").alias("postcode"),
            "east1m",
            "north1m",
        ).collect(engine="streaming")
        listed_flags = _listed_building_flags(
            wide.select("postcode", "pp_address", "epc_address"),
            active_postcodes_for_listed,
            listed_buildings_path,
        )
        wide = wide.join(listed_flags.lazy(), on=["postcode", "pp_address"], how="left")
    else:
        wide = wide.with_columns(
            pl.lit(None, dtype=pl.Utf8).alias(LISTED_BUILDING_FEATURE)
        )

    # Only listings mode merges scraped listings into the frame. Normal mode
    # ignores `actual_listings_path`, as documented, so listing rows cannot
    # pollute the postcode/properties outputs. The path is guaranteed to be
    # non-None here by the check at the top of the function.
    if mode == "listings":
        wide = _integrate_listings(
            wide,
            actual_listings_path,
            arcgis_path,
            epc_path=actual_listings_epc_path,
        )

    # Any row still lacking a listed-building flag defaults to "No".
    wide = wide.with_columns(pl.col(LISTED_BUILDING_FEATURE).fill_null("No"))

    # NSPL Feb 2026 renamed geographic code columns to {field}{year}cd.
    # Alias them back to the short canonical names used across the
    # pipeline so downstream joins don't need to know about NSPL's
    # versioning scheme.
    arcgis = active_england.select(
        pl.col("pcds").alias("postcode"),
        "lat",
        pl.col("long").alias("lon"),
        pl.col("lsoa21cd").alias("lsoa21"),
        pl.col("oa21cd").alias("oa21"),
        pl.col("pcon24cd").alias("pcon"),
    )
    wide = wide.join(arcgis, on="postcode", how="left")

    iod = pl.scan_parquet(iod_path).with_columns(
        *(_less_deprived_percentile_expr(c) for c in _IOD_PERCENTILE_COLUMNS)
    )
    wide = wide.join(iod, left_on="lsoa21", right_on="LSOA code (2021)", how="left")

    ethnicity = pl.scan_parquet(ethnicity_path)
    wide = wide.join(
        ethnicity,
        left_on="Local Authority District code (2024)",
        right_on="Geography_code",
        how="left",
    )

    # Derive bedroom count: habitable rooms - 1 (assuming 1 reception room), clipped to 0..4
    wide = wide.with_columns(
        (pl.col("number_habitable_rooms") - 1)
        .clip(0, 4)
        .cast(pl.UInt8)
        .alias("_bedrooms"),
    )
    rental = pl.scan_parquet(rental_prices_path).select(
        "area_code", "bedrooms", "mean_monthly_rent"
    )
    wide = wide.join(
        rental,
        left_on=["Local Authority District code (2024)", "_bedrooms"],
        right_on=["area_code", "bedrooms"],
        how="left",
    )

    crime = pl.scan_parquet(crime_path)
    wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")

    # Collapse the per-category crime averages into two headline totals.
    wide = wide.with_columns(
        pl.sum_horizontal(
            "Violence and sexual offences (avg/yr)",
            "Robbery (avg/yr)",
            "Burglary (avg/yr)",
            "Possession of weapons (avg/yr)",
        ).alias("serious_crime_avg_yr"),
        pl.sum_horizontal(
            "Anti-social behaviour (avg/yr)",
            "Criminal damage and arson (avg/yr)",
            "Shoplifting (avg/yr)",
            "Bicycle theft (avg/yr)",
            "Theft from the person (avg/yr)",
            "Other theft (avg/yr)",
            "Vehicle crime (avg/yr)",
            "Public order (avg/yr)",
            "Drugs (avg/yr)",
            "Other crime (avg/yr)",
        ).alias("minor_crime_avg_yr"),
    )

    lsoa_pop = pl.scan_parquet(lsoa_population_path)
    wide = wide.join(lsoa_pop, on="lsoa21", how="left")
    # Per-1k rates are only computed for a positive population; with no
    # `otherwise` branch every other row gets null.
    wide = wide.with_columns(
        pl.when(pl.col("population") > 0)
        .then((pl.col("serious_crime_avg_yr") / pl.col("population") * 1000).round(1))
        .alias("serious_crime_per_1k"),
        pl.when(pl.col("population") > 0)
        .then((pl.col("minor_crime_avg_yr") / pl.col("population") * 1000).round(1))
        .alias("minor_crime_per_1k"),
    ).drop("population")

    median_age = pl.scan_parquet(median_age_path)
    wide = wide.join(median_age, on="lsoa21", how="left")

    election = pl.scan_parquet(election_results_path)
    wide = wide.join(election, on="pcon", how="left")

    poi_counts = pl.scan_parquet(poi_proximity_path)
    wide = wide.join(poi_counts, on="postcode", how="left")

    noise_cols = ["road_noise_lden_db", "rail_noise_lden_db", "airport_noise_lden_db"]
    noise = (
        pl.scan_parquet(noise_path)
        .with_columns(
            # NaN → null so max_horizontal ignores missing instead of propagating NaN
            *[pl.col(c).fill_nan(None) for c in noise_cols],
        )
        .with_columns(
            pl.max_horizontal(*noise_cols).alias("noise_lden_db"),
        )
        .select("postcode", "noise_lden_db")
    )
    wide = wide.join(noise, on="postcode", how="left")

    school_proximity = pl.scan_parquet(school_proximity_path)
    wide = wide.join(school_proximity, on="postcode", how="left")

    conservation_areas = _conservation_area_by_postcode(
        arcgis.select("postcode", "lat", "lon"), conservation_areas_path
    )
    # Postcodes absent from the conservation-area result default to "No".
    wide = wide.join(conservation_areas, on="postcode", how="left").with_columns(
        pl.col(CONSERVATION_AREA_FEATURE).fill_null("No")
    )

    if tree_density_postcodes_path is not None:
        tree_density = _tree_density_by_postcode(tree_density_postcodes_path)
        wide = wide.join(tree_density, on="postcode", how="left")

    # Broadband: derive max available download speed tier per postcode from
    # Ofcom availability percentages. Tiers: Gigabit ≥1000, UFBB ≥300,
    # UFBB(100) ≥100, SFBB ≥30 Mbps. Stored as string enum.
    broadband = (
        pl.scan_parquet(broadband_path)
        .select(
            pl.col("postcode_space").alias("bb_postcode"),
            pl.when(pl.col("Gigabit availability (% premises)") > 0)
            .then(1000)
            .when(pl.col("UFBB availability (% premises)") > 0)
            .then(300)
            .when(pl.col("UFBB (100Mbit/s) availability (% premises)") > 0)
            .then(100)
            .when(pl.col("SFBB availability (% premises)") > 0)
            .then(30)
            .otherwise(10)
            .cast(pl.UInt16)
            .alias("max_download_speed"),
        )
        # One row per postcode: keep the best tier if the source repeats one.
        .group_by("bb_postcode")
        .agg(pl.col("max_download_speed").max())
        .with_columns(pl.col("max_download_speed").cast(pl.Utf8))
    )
    wide = wide.join(broadband, left_on="postcode", right_on="bb_postcode", how="left")

    # Derive property_type: prefer EPC data, fall back to price-paid.
    # For Houses, use built_form (e.g. Semi-Detached, Mid-Terrace) for finer detail.
    bad_built_form = pl.col("built_form").is_null() | pl.col("built_form").is_in(
        ["NO DATA!", "Not Recorded"]
    )
    has_epc = pl.col("epc_property_type").is_not_null()
    is_house = pl.col("epc_property_type") == "House"
    wide = wide.with_columns(
        pl.when(has_epc & is_house & ~bad_built_form)
        .then(pl.col("built_form"))
        .when(has_epc & is_house)
        .then(pl.col("pp_property_type"))
        .when(has_epc)
        .then(pl.col("epc_property_type"))
        .otherwise(pl.col("pp_property_type"))
        # Unify EPC's "Flat"/"Maisonette" with price-paid's "Flats/Maisonettes",
        # collapse terrace sub-types, and fold rare types into "Other"
        .replace(
            {
                "Flat": "Flats/Maisonettes",
                "Maisonette": "Flats/Maisonettes",
                "End-Terrace": "Terraced",
                "Mid-Terrace": "Terraced",
                "Enclosed End-Terrace": "Terraced",
                "Enclosed Mid-Terrace": "Terraced",
                "Bungalow": "Other",
                "Park home": "Other",
            }
        )
        .alias("property_type")
    )

    wide = (
        wide.with_columns(
            # Sentinel values ("U" tenure, "INVALID!" rating) become null.
            pl.when(pl.col("duration") == "U")
            .then(None)
            .otherwise(pl.col("duration"))
            .alias("duration"),
            pl.when(pl.col("current_energy_rating") == "INVALID!")
            .then(None)
            .otherwise(pl.col("current_energy_rating"))
            .alias("current_energy_rating"),
        )
        .with_columns(
            (pl.col("latest_price") / pl.col("total_floor_area"))
            .round(0)
            .cast(pl.Int32)
            .alias("Price per sqm"),
        )
        # Drop join keys, helper columns and source columns that are not
        # part of the output schema.
        .drop(
            "inspection_date",
            "_bedrooms",
            "LSOA name (2021)",
            "Local Authority District code (2024)",
            "Local Authority District name (2024)",
            "Wider Barriers Sub-domain Score",
            "Geographical Barriers Sub-domain Score",
            "Adult Skills Sub-domain Score",
            "Children and Young People Sub-domain Score",
            "Crime Score",
            "Living Environment Score",
            "Index of Multiple Deprivation (IMD) Score",
            "Income Deprivation Affecting Older People (IDAOPI) Score (rate)",
            "Income Deprivation Affecting Children Index (IDACI) Score (rate)",
            "Barriers to Housing and Services Score",
            "oa21",
            "pcon",
            "epc_property_type",
            "pp_property_type",
            "built_form",
        )
        # Internal snake_case names → display names used in the outputs.
        .rename(
            {
                "date_of_transfer": "Date of last transaction",
                "construction_age_band": "Construction year",
                "is_construction_date_approximate": "Is construction date approximate",
                "Income Score (rate)": "Income Score",
                "Employment Score (rate)": "Employment Score",
                "Indoors Sub-domain Score": "Housing Conditions Score",
                "Outdoors Sub-domain Score": "Air Quality and Road Safety Score",
                "pp_address": "Address per Property Register",
                "epc_address": "Address per EPC",
                "postcode": "Postcode",
                "duration": "Leasehold/Freehold",
                "current_energy_rating": "Current energy rating",
                "potential_energy_rating": "Potential energy rating",
                "total_floor_area": "Total floor area (sqm)",
                "property_type": "Property type",
                "restaurants_2km": "Number of restaurants within 2km",
                "groceries_2km": "Number of grocery shops and supermarkets within 2km",
                "latest_price": "Last known price",
                "number_habitable_rooms": "Number of bedrooms & living rooms",
                "noise_lden_db": "Noise (dB)",
                "good_primary_5km": "Good+ primary schools within 5km",
                "good_secondary_5km": "Good+ secondary schools within 5km",
                "good_primary_2km": "Good+ primary schools within 2km",
                "good_secondary_2km": "Good+ secondary schools within 2km",
                "outstanding_primary_5km": "Outstanding primary schools within 5km",
                "outstanding_secondary_5km": "Outstanding secondary schools within 5km",
                "outstanding_primary_2km": "Outstanding primary schools within 2km",
                "outstanding_secondary_2km": "Outstanding secondary schools within 2km",
                "max_download_speed": "Max available download speed (Mbps)",
                "serious_crime_avg_yr": "Serious crime (avg/yr)",
                "minor_crime_avg_yr": "Minor crime (avg/yr)",
                "serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
                "minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
                "mean_monthly_rent": "Estimated monthly rent",
                "floor_height": "Interior height (m)",
                "was_council_house": "Former council house",
                "median_age": "Median age",
                "turnout_pct": "Voter turnout (%)",
            }
        )
    )

    print("Collecting with streaming engine...")
    df = wide.collect(engine="streaming")

    if mode == "listings":
        enriched_listings = _finalize_listings(df)
        _validate_property_postcodes(enriched_listings)
        print(f"Enriched listings rows: {enriched_listings.height}")
        return _BuildResult(listings=enriched_listings)

    _validate_property_postcodes(df)

    # Split into postcode-level and property-level dataframes
    area_cols = [
        c for c in df.columns if c in _AREA_COLUMNS or _is_dynamic_poi_metric_column(c)
    ]
    postcode_df = df.select(area_cols).group_by("Postcode").first()
    print(f"Postcode rows: {postcode_df.height} (unique postcodes)")

    # "Postcode" is an area column but is kept on the property frame too.
    property_cols = [
        c
        for c in df.columns
        if (c not in _AREA_COLUMNS and not _is_dynamic_poi_metric_column(c))
        or c == "Postcode"
    ]
    properties_df = df.select(property_cols)
    print(f"Property rows: {properties_df.height}")

    return _BuildResult(postcode=postcode_df, properties=properties_df)
|
|
|
|
|
|
def main():
    """Command-line entry point: parse arguments, run `_build`, write parquet output.

    Two mutually exclusive modes, selected by the presence of
    `--actual-listings`:

      * normal mode (no `--actual-listings`) — requires `--output-postcodes`
        and `--output-properties` and writes both files.
      * listings mode (`--actual-listings` given) — requires
        `--output-listings` and writes only that file.

    Invalid argument combinations are reported through `parser.error`, which
    prints a usage message and exits the process.
    """
    parser = argparse.ArgumentParser(
        description="Build wide property dataframe with all joins"
    )
    parser.add_argument(
        "--epc-pp", type=Path, required=True, help="EPC-Price Paid joined parquet file"
    )
    parser.add_argument(
        "--arcgis", type=Path, required=True, help="ArcGIS postcode data parquet file"
    )
    parser.add_argument(
        "--iod",
        type=Path,
        required=True,
        help="Index of Deprivation parquet file",
    )
    # `_build` scans this file unconditionally, so it must be supplied;
    # requiring it here gives a usage error instead of a late failure.
    parser.add_argument(
        "--poi-proximity",
        type=Path,
        required=True,
        help="POI proximity counts parquet file",
    )
    parser.add_argument(
        "--ethnicity",
        type=Path,
        required=True,
        help="Ethnicity by local authority parquet file",
    )
    parser.add_argument(
        "--crime",
        type=Path,
        required=True,
        help="Crime by LSOA parquet file",
    )
    parser.add_argument(
        "--noise", type=Path, required=True, help="Road noise by postcode parquet file"
    )
    parser.add_argument(
        "--school-proximity",
        type=Path,
        required=True,
        help="School proximity counts parquet file",
    )
    parser.add_argument(
        "--broadband",
        type=Path,
        required=True,
        help="Broadband performance by output area parquet file",
    )
    parser.add_argument(
        "--conservation-areas",
        type=Path,
        required=True,
        help="Planning Data conservation areas GeoJSON",
    )
    parser.add_argument(
        "--listed-buildings",
        type=Path,
        required=False,
        help="Historic England NHLE listed-building points GeoPackage",
    )
    parser.add_argument(
        "--rental-prices",
        type=Path,
        required=True,
        help="ONS rental prices by LA and bedroom count parquet file",
    )
    parser.add_argument(
        "--lsoa-population",
        type=Path,
        required=True,
        help="Census 2021 population by LSOA parquet file",
    )
    parser.add_argument(
        "--median-age",
        type=Path,
        required=True,
        help="Census 2021 median age by LSOA parquet file",
    )
    parser.add_argument(
        "--election-results",
        type=Path,
        required=True,
        help="2024 General Election results by constituency parquet file",
    )
    parser.add_argument(
        "--tree-density-postcodes",
        type=Path,
        required=False,
        help="Postcode-level tree density parquet from pipeline.transform.tree_density",
    )
    parser.add_argument(
        "--output-postcodes",
        type=Path,
        required=False,
        help="Output postcode parquet (normal mode only)",
    )
    parser.add_argument(
        "--output-properties",
        type=Path,
        required=False,
        help="Output properties parquet (normal mode only)",
    )
    parser.add_argument(
        "--actual-listings",
        type=Path,
        required=False,
        help=(
            "Optional scraped-listings parquet. When provided, listings flow "
            "through the same merge pipeline as historical properties — set "
            "--output-listings to write the enriched-listings file instead "
            "of the postcode/properties files."
        ),
    )
    parser.add_argument(
        "--epc",
        type=Path,
        required=False,
        help=(
            "Raw EPC certificates CSV or zip. Used only with --actual-listings "
            "to match live listings directly to EPC records."
        ),
    )
    parser.add_argument(
        "--output-listings",
        type=Path,
        required=False,
        help=(
            "Output enriched-listings parquet path. Required (and only valid) "
            "when --actual-listings is set; --output-postcodes and "
            "--output-properties are ignored in this mode."
        ),
    )
    args = parser.parse_args()

    # The presence of --actual-listings is what selects listings mode.
    listings_mode = args.actual_listings is not None
    if listings_mode and args.output_listings is None:
        parser.error("--output-listings is required when --actual-listings is set")
    if not listings_mode and (
        args.output_postcodes is None or args.output_properties is None
    ):
        parser.error(
            "--output-postcodes and --output-properties are required in normal mode"
        )

    result = _build(
        epc_pp_path=args.epc_pp,
        arcgis_path=args.arcgis,
        iod_path=args.iod,
        poi_proximity_path=args.poi_proximity,
        ethnicity_path=args.ethnicity,
        crime_path=args.crime,
        noise_path=args.noise,
        school_proximity_path=args.school_proximity,
        broadband_path=args.broadband,
        conservation_areas_path=args.conservation_areas,
        rental_prices_path=args.rental_prices,
        lsoa_population_path=args.lsoa_population,
        median_age_path=args.median_age,
        election_results_path=args.election_results,
        tree_density_postcodes_path=args.tree_density_postcodes,
        listed_buildings_path=args.listed_buildings,
        actual_listings_path=args.actual_listings,
        # --epc is only meaningful together with --actual-listings.
        actual_listings_epc_path=args.epc if listings_mode else None,
        mode="listings" if listings_mode else "normal",
    )

    if listings_mode:
        listings_df = result.listings
        assert listings_df is not None  # guaranteed by mode contract
        args.output_listings.parent.mkdir(parents=True, exist_ok=True)
        listings_df.write_parquet(args.output_listings)
        size_mb = args.output_listings.stat().st_size / (1024 * 1024)
        print(
            f"\nEnriched listings: {listings_df.height} rows, "
            f"{len(listings_df.columns)} columns"
        )
        print(f"Wrote {args.output_listings} ({size_mb:.1f} MB)")
        return

    postcode_df = result.postcode
    properties_df = result.properties
    assert postcode_df is not None and properties_df is not None

    print(f"\nPostcode columns: {postcode_df.columns}")
    print(f"Postcode rows: {postcode_df.height}")
    # Create the target directory first, as the listings branch already does.
    args.output_postcodes.parent.mkdir(parents=True, exist_ok=True)
    postcode_df.write_parquet(args.output_postcodes)
    size_mb = args.output_postcodes.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output_postcodes} ({size_mb:.1f} MB)")

    print(f"\nProperty columns: {properties_df.columns}")
    print(f"Property rows: {properties_df.height}")
    args.output_properties.parent.mkdir(parents=True, exist_ok=True)
    properties_df.write_parquet(args.output_properties)
    size_mb = args.output_properties.stat().st_size / (1024 * 1024)
    print(f"Wrote {args.output_properties} ({size_mb:.1f} MB)")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()