scraping and data

This commit is contained in:
Andras Schmelczer 2026-05-31 15:36:33 +01:00
parent d98819b569
commit 8688b7475e
43 changed files with 4920 additions and 531 deletions

View file

@ -48,7 +48,7 @@ _AREA_COLUMNS = [
"lon",
# Runtime provenance for deciding whether missing coordinates are skippable.
"ctry25cd",
# Keyed lookup for postcode-level side tables (e.g. crime time series).
# Join key for LSOA-level side tables (e.g. median age).
"lsoa21",
# Deprivation
"Income Score",
@ -81,8 +81,6 @@ _AREA_COLUMNS = [
"Other crime (avg/yr)",
"Serious crime (avg/yr)",
"Minor crime (avg/yr)",
"Serious crime per 1k residents (avg/yr)",
"Minor crime per 1k residents (avg/yr)",
# Amenities
"Number of restaurants within 2km",
"Number of grocery shops and supermarkets within 2km",
@ -742,16 +740,13 @@ _PROPERTY_TYPE_VALUES = [
"Other",
]
_EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
_PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS = 82
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS = 96
_PROPERTY_MATCH_MIN_MARGIN = 4.0
_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
_DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
_DIRECT_EPC_MATCH_MIN_MARGIN = 4.0
_DIRECT_EPC_NEARBY_RADIUS_M = 500.0
_DIRECT_EPC_NEAREST_POSTCODES = 40
# Listings are matched to EPC certificates and Price-Paid properties first by
# UPRN (exact) and otherwise by fuzzy street-address similarity within the same
# postcode. A house number in the listing address is the strong disambiguator,
# so a numbered listing may match on a lower street-similarity score than a
# number-less one (which must match the street almost exactly to be trusted).
_LISTING_MATCH_MIN_SCORE_WITH_NUMBERS = 82
_LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 90
_DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
("_direct_epc_address", pl.Utf8),
("_direct_current_energy_rating", pl.Utf8),
@ -764,7 +759,7 @@ _DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
("_direct_was_council_house", pl.Utf8),
("_direct_epc_match_status", pl.Utf8),
("_direct_epc_match_score", pl.Float32),
("_direct_epc_match_margin", pl.Float32),
("_direct_epc_match_method", pl.Utf8),
)
_DIRECT_EPC_RAW_COLUMN_MAP = {
"epc_address": "_direct_epc_address",
@ -840,46 +835,6 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
)
def _ratio_bonus(
left: float | int | None, right: float | int | None, pct: float, cap: float
) -> float:
if left is None or right is None:
return 0.0
try:
left_f = float(left)
right_f = float(right)
except (TypeError, ValueError):
return 0.0
if left_f <= 0 or right_f <= 0:
return 0.0
rel = abs(left_f - right_f) / max(left_f, right_f)
if rel > pct:
return 0.0
return cap * (1.0 - rel / pct)
def _rooms_bonus(left: int | None, right: int | None) -> float:
if left is None or right is None:
return 0.0
try:
diff = abs(int(left) - int(right))
except (TypeError, ValueError):
return 0.0
if diff == 0:
return 4.0
if diff == 1:
return 2.0
return 0.0
def _enum_bonus(
left: str | None, right: str | None, *, exact: float, mismatch: float
) -> float:
if not left or not right:
return 0.0
return exact if left == right else mismatch
def _address_score(query: str, candidate: str | None) -> int:
if not candidate:
return 0
@ -893,6 +848,85 @@ def _has_number(address: str | None) -> bool:
return bool(address and _NUMBER_RE.search(address))
def _normalize_uprn(value: object) -> str | None:
"""Canonical UPRN string (digits only) or None.
UPRNs arrive as strings or ints from the scraper / EPC register; normalise
so a listing UPRN and an EPC/property UPRN compare equal regardless of dtype
or stray whitespace. A float (e.g. a NaN-bearing column read as Float) is
rejected unless it is an exact integer, so "123.0"/"1.5e11" can never be
silently mangled into a bogus all-digits key.
"""
if value is None:
return None
if isinstance(value, float):
if not value.is_integer():
return None
value = int(value)
digits = re.sub(r"\D", "", str(value))
return digits or None
def _best_listing_match(
listing_uprn: str | None,
query: str | None,
uprn_index: dict[str, dict],
bucket_candidates: list[dict],
addressed_fields: list[str],
) -> tuple[dict, float, str, str | None] | None:
"""Pick the best candidate for a listing.
Matching is, in order: (1) an exact UPRN equality against the global
``uprn_index`` (postcode-independent, so it is robust even when the
listing's postcode is slightly off); (2) failing that, the highest
fuzzy street-address similarity within the listing's own postcode bucket.
No property-attribute heuristics are used a house number in the listing
address gates the fuzzy match (`_numbers_compatible`) and lowers the score
threshold; a number-less address must match the street almost exactly.
``addressed_fields`` names the candidate columns to fuzzy-match against (a
candidate may carry both a register and an EPC address). Returns
``(candidate, score, method, matched_field)`` or None. ``method`` is
"uprn" or "address"; ``matched_field`` is the winning address column (or
None for a UPRN match).
"""
if listing_uprn:
hit = uprn_index.get(listing_uprn)
if hit is not None:
return hit, 100.0, "uprn", None
if not query:
return None
listing_has_numbers = _has_number(query)
best: dict | None = None
best_score = 0
best_field: str | None = None
for candidate in bucket_candidates:
for field in addressed_fields:
address = candidate.get(field)
if not address:
continue
if listing_has_numbers and not _numbers_compatible(query, address):
continue
score = _address_score(query, address)
if score > best_score:
best_score = score
best = candidate
best_field = field
if best is None:
return None
threshold = (
_LISTING_MATCH_MIN_SCORE_WITH_NUMBERS
if listing_has_numbers
else _LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS
)
if best_score < threshold:
return None
return best, float(best_score), "address", best_field
def _load_listings_for_merge(
listings_path: Path, arcgis_path: Path
) -> pl.DataFrame:
@ -908,6 +942,20 @@ def _load_listings_for_merge(
raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx")
postcode_mapping = build_postcode_mapping(arcgis_path).lazy()
# UPRN is only present on scraped listings that carry it (Zoopla detail
# pages); tolerate its absence so older parquets and test fixtures still
# load. Digits-only so it compares equal to the EPC register's UPRN.
if "UPRN" in raw.collect_schema().names():
uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
listing_uprn_expr = (
pl.when(uprn_digits.str.len_chars() > 0)
.then(uprn_digits)
.otherwise(None)
.alias("_listing_uprn")
)
else:
listing_uprn_expr = pl.lit(None, dtype=pl.Utf8).alias("_listing_uprn")
# Listings parquets occasionally carry Float NaNs (e.g. floor area). Polars
# treats NaN as distinct from null and the downstream `latest_price /
# total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats
@ -936,12 +984,14 @@ def _load_listings_for_merge(
"postcode"
),
pl.col("Address per Property Register").alias("pp_address"),
listing_uprn_expr,
*overlay,
)
.select(
"_listing_idx",
"postcode",
"pp_address",
"_listing_uprn",
*[dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES],
)
.collect(engine="streaming")
@ -972,7 +1022,6 @@ def _empty_direct_epc_matches() -> pl.DataFrame:
def _load_direct_epc_candidates(
epc_path: Path,
arcgis_path: Path,
listing_outcodes: list[str],
temp_dir: Path,
) -> pl.DataFrame:
@ -982,8 +1031,7 @@ def _load_direct_epc_candidates(
"_direct_epc_match_postcode": pl.Utf8,
"_direct_epc_outcode": pl.Utf8,
"_direct_epc_canonical_property_type": pl.Utf8,
"_direct_epc_east": pl.Float64,
"_direct_epc_north": pl.Float64,
"_direct_epc_uprn": pl.Utf8,
**{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
}
if not listing_outcodes:
@ -1016,12 +1064,6 @@ def _load_direct_epc_candidates(
.with_columns(pl.lit("Yes").alias("_direct_was_council_house"))
)
arcgis = pl.scan_parquet(arcgis_path).select(
normalize_postcode_key(pl.col("pcds")).alias("_direct_epc_match_postcode"),
pl.col("east1m").alias("_direct_epc_east"),
pl.col("north1m").alias("_direct_epc_north"),
)
return (
epc_base.sort("inspection_date", descending=True)
.group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
@ -1031,7 +1073,6 @@ def _load_direct_epc_candidates(
on=["_direct_epc_match_address", "_direct_epc_match_postcode"],
how="left",
)
.join(arcgis, on="_direct_epc_match_postcode", how="left")
.with_columns(
_canonical_epc_property_type_expr().alias(
"_direct_epc_canonical_property_type"
@ -1046,6 +1087,7 @@ def _load_direct_epc_candidates(
.otherwise(None)
.alias("_direct_potential_energy_rating"),
pl.col("epc_address").alias("_direct_epc_address"),
pl.col("uprn").alias("_direct_epc_uprn"),
pl.col("total_floor_area").alias("_direct_total_floor_area"),
pl.col("number_habitable_rooms").alias(
"_direct_number_habitable_rooms"
@ -1066,8 +1108,7 @@ def _load_direct_epc_candidates(
"_direct_epc_match_postcode",
"_direct_epc_outcode",
"_direct_epc_canonical_property_type",
"_direct_epc_east",
"_direct_epc_north",
"_direct_epc_uprn",
"_direct_epc_address",
"_direct_current_energy_rating",
"_direct_potential_energy_rating",
@ -1083,7 +1124,14 @@ def _load_direct_epc_candidates(
def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
match = listings.with_columns(
"""Add the normalised address/postcode/outcode keys used to match listings.
Listings are matched to EPC certificates and properties by UPRN and by
fuzzy street address within their (now accurate, detail-page-sourced)
postcode — never by coordinate proximity — so no projected easting/northing
is computed here. `_listing_uprn` flows through from the loaded listings.
"""
return listings.with_columns(
normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"),
normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"),
).with_columns(
@ -1092,21 +1140,6 @@ def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
.alias("_listing_outcode")
)
if match.is_empty():
return match.with_columns(
pl.Series("_listing_east", [], dtype=pl.Float64),
pl.Series("_listing_north", [], dtype=pl.Float64),
)
transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
east, north = transformer.transform(
match["_actual_lon"].to_numpy(), match["_actual_lat"].to_numpy()
)
return match.with_columns(
pl.Series("_listing_east", east, dtype=pl.Float64),
pl.Series("_listing_north", north, dtype=pl.Float64),
)
def _optional_lazy_col(
schema: pl.Schema, column: str, dtype: pl.DataType
@ -1122,8 +1155,7 @@ def _listing_property_match_schema() -> dict[str, pl.DataType]:
"_matched_postcode": pl.Utf8,
"_matched_pp_address": pl.Utf8,
"_property_match_score": pl.Float32,
"_property_match_address_score": pl.Int32,
"_property_match_margin": pl.Float32,
"_property_match_method": pl.Utf8,
"_property_match_field": pl.Utf8,
}
@ -1139,11 +1171,8 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
pl.col("postcode").cast(pl.Utf8).alias("postcode"),
pl.col("pp_address").cast(pl.Utf8).alias("pp_address"),
_optional_lazy_col(schema, "epc_address", pl.Utf8),
_optional_lazy_col(schema, "pp_property_type", pl.Utf8),
_optional_lazy_col(schema, "duration", pl.Utf8),
_optional_lazy_col(schema, "total_floor_area", pl.Float64),
_optional_lazy_col(schema, "number_habitable_rooms", pl.Int16),
_optional_lazy_col(schema, "latest_price", pl.Int64),
# UPRN keys the exact match; present once epc_pp is rebuilt with it.
_optional_lazy_col(schema, "uprn", pl.Utf8),
)
.with_row_index("_property_row")
.with_columns(
@ -1167,110 +1196,52 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
)
def _property_candidates_by_postcode(
candidates: pl.DataFrame,
) -> dict[str, list[dict]]:
def _index_candidates(
candidates: pl.DataFrame, postcode_key: str, uprn_key: str
) -> tuple[dict[str, list[dict]], dict[str, dict]]:
"""Index candidate rows for matching, in a single pass over the frame.
Returns ``(postcode_buckets, uprn_index)``. The postcode buckets drive the
fuzzy street-address match; the UPRN index drives the exact match and is
postcode-independent, so it still resolves when a listing's postcode is
slightly off.
"""
buckets: dict[str, list[dict]] = {}
uprn_index: dict[str, dict] = {}
for row in candidates.iter_rows(named=True):
postcode = row.get("_property_match_postcode")
postcode = row.get(postcode_key)
if postcode:
buckets.setdefault(postcode, []).append(row)
return buckets
uprn = _normalize_uprn(row.get(uprn_key))
if uprn and uprn not in uprn_index:
uprn_index[uprn] = row
return buckets, uprn_index
def _best_listing_property_candidate(
listing: dict, candidates: list[dict]
listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
) -> dict | None:
query = listing.get("_listing_match_address")
if not query:
return None
listing_has_numbers = _has_number(query)
scored: list[tuple[float, int, dict, str]] = []
for candidate in candidates:
register_address = candidate.get("_property_match_address")
epc_address = candidate.get("_property_epc_match_address")
register_numbers_compatible = bool(
register_address and _numbers_compatible(query, register_address)
)
epc_numbers_compatible = bool(
epc_address and _numbers_compatible(query, epc_address)
)
if not (register_numbers_compatible or epc_numbers_compatible):
continue
register_score = _address_score(query, register_address)
epc_score = _address_score(query, epc_address)
base_score = max(register_score, epc_score)
if base_score == 0:
continue
score = float(base_score)
score += _enum_bonus(
listing.get("_actual_property_type"),
candidate.get("pp_property_type"),
exact=7.0,
mismatch=-8.0,
)
score += _enum_bonus(
listing.get("_actual_leasehold_freehold"),
candidate.get("duration"),
exact=3.0,
mismatch=-3.0,
)
score += _ratio_bonus(
listing.get("_actual_total_floor_area"),
candidate.get("total_floor_area"),
pct=0.15,
cap=8.0,
)
score += _rooms_bonus(
listing.get("_actual_number_habitable_rooms"),
candidate.get("number_habitable_rooms"),
)
score += _ratio_bonus(
listing.get("_actual_asking_price"),
candidate.get("latest_price"),
pct=0.25,
cap=3.0,
)
matched_field = (
"pp_address" if register_score >= epc_score else "epc_address"
)
scored.append((score, base_score, candidate, matched_field))
if not scored:
return None
scored.sort(key=lambda item: item[0], reverse=True)
top = scored[0]
runner_up = scored[1][0] if len(scored) > 1 else None
margin = top[0] - runner_up if runner_up is not None else top[0]
score_threshold = (
_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
if listing_has_numbers
else _PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
result = _best_listing_match(
listing.get("_listing_uprn"),
listing.get("_listing_match_address"),
uprn_index,
candidates,
["_property_match_address", "_property_epc_match_address"],
)
address_threshold = (
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS
if listing_has_numbers
else _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS
)
if (
top[0] < score_threshold
or top[1] < address_threshold
or margin < _PROPERTY_MATCH_MIN_MARGIN
):
if result is None:
return None
candidate = top[2]
candidate, score, method, field = result
matched_field = {
"_property_match_address": "pp_address",
"_property_epc_match_address": "epc_address",
}.get(field, method)
return {
"_listing_idx": listing["_listing_idx"],
"_matched_postcode": candidate.get("postcode"),
"_matched_pp_address": candidate.get("pp_address"),
"_property_match_score": round(top[0], 1),
"_property_match_address_score": top[1],
"_property_match_margin": round(margin, 1),
"_property_match_field": top[3],
"_property_match_score": round(score, 1),
"_property_match_method": method,
"_property_match_field": matched_field,
}
@ -1280,23 +1251,32 @@ def _match_listing_properties(
if listing_matches.is_empty() or property_candidates.is_empty():
return _empty_listing_property_matches()
buckets = _property_candidates_by_postcode(property_candidates)
buckets, uprn_index = _index_candidates(
property_candidates, "_property_match_postcode", "uprn"
)
best_matches = []
for listing in listing_matches.iter_rows(named=True):
postcode = listing.get("_listing_match_postcode")
if not postcode:
continue
match = _best_listing_property_candidate(listing, buckets.get(postcode, []))
bucket = buckets.get(postcode, []) if postcode else []
match = _best_listing_property_candidate(listing, uprn_index, bucket)
if match is not None:
best_matches.append(match)
if not best_matches:
return _empty_listing_property_matches()
# When two listings claim the same property, keep the most authoritative
# match: an exact UPRN match always wins over a fuzzy address match (both can
# score 100, so method must break the tie before score and listing index).
matches = pl.DataFrame(best_matches, schema=_listing_property_match_schema())
return (
matches.sort(
["_property_match_score", "_listing_idx"], descending=[True, False]
[
pl.col("_property_match_method") == "uprn",
"_property_match_score",
"_listing_idx",
],
descending=[True, True, False],
)
.unique(
["_matched_postcode", "_matched_pp_address"],
@ -1307,133 +1287,19 @@ def _match_listing_properties(
)
def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
buckets: dict[str, list[dict]] = {}
for row in candidates.iter_rows(named=True):
postcode = row.get("_direct_epc_match_postcode")
if postcode:
buckets.setdefault(postcode, []).append(row)
return buckets
def _epc_postcode_tree(
candidates: pl.DataFrame,
) -> tuple[cKDTree | None, list[str]]:
postcode_points = (
candidates.select(
"_direct_epc_match_postcode",
"_direct_epc_east",
"_direct_epc_north",
)
.drop_nulls()
.filter(
pl.col("_direct_epc_east").is_finite()
& pl.col("_direct_epc_north").is_finite()
)
.unique("_direct_epc_match_postcode")
def _best_direct_epc_candidate(
listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
) -> dict | None:
result = _best_listing_match(
listing.get("_listing_uprn"),
listing.get("_listing_match_address"),
uprn_index,
candidates,
["_direct_epc_match_address"],
)
if postcode_points.is_empty():
return None, []
coords = np.column_stack(
[
postcode_points["_direct_epc_east"].to_numpy(),
postcode_points["_direct_epc_north"].to_numpy(),
]
)
return cKDTree(coords), postcode_points["_direct_epc_match_postcode"].to_list()
def _candidate_postcodes_for_listing(
listing: dict,
postcode_tree: cKDTree | None,
postcode_values: list[str],
) -> list[str]:
postcodes: list[str] = []
exact = listing.get("_listing_match_postcode")
if exact:
postcodes.append(exact)
if postcode_tree is None:
return postcodes
east = listing.get("_listing_east")
north = listing.get("_listing_north")
try:
east_f = float(east)
north_f = float(north)
except (TypeError, ValueError):
return postcodes
if not np.isfinite(east_f) or not np.isfinite(north_f):
return postcodes
k = min(_DIRECT_EPC_NEAREST_POSTCODES, len(postcode_values))
distances, indices = postcode_tree.query(
[east_f, north_f],
k=k,
distance_upper_bound=_DIRECT_EPC_NEARBY_RADIUS_M,
)
distances = np.atleast_1d(distances)
indices = np.atleast_1d(indices)
seen = set(postcodes)
for distance, idx in zip(distances, indices, strict=False):
if not np.isfinite(distance) or idx >= len(postcode_values):
continue
postcode = postcode_values[int(idx)]
if postcode not in seen:
postcodes.append(postcode)
seen.add(postcode)
return postcodes
def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
query = listing.get("_listing_match_address")
if not query:
if result is None:
return None
listing_has_numbers = _has_number(query)
scored: list[tuple[float, int, dict]] = []
for candidate in candidates:
address = candidate.get("_direct_epc_match_address")
if listing_has_numbers and not _numbers_compatible(query, address or ""):
continue
base_score = _address_score(query, address)
if base_score == 0:
continue
score = float(base_score)
score += _enum_bonus(
listing.get("_actual_property_type"),
candidate.get("_direct_epc_canonical_property_type"),
exact=6.0,
mismatch=-6.0,
)
score += _ratio_bonus(
listing.get("_actual_total_floor_area"),
candidate.get("_direct_total_floor_area"),
pct=0.12,
cap=8.0,
)
score += _rooms_bonus(
listing.get("_actual_number_habitable_rooms"),
candidate.get("_direct_number_habitable_rooms"),
)
scored.append((score, base_score, candidate))
if not scored:
return None
scored.sort(key=lambda item: item[0], reverse=True)
top = scored[0]
runner_up = scored[1][0] if len(scored) > 1 else None
margin = top[0] - runner_up if runner_up is not None else top[0]
threshold = (
_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS
if listing_has_numbers
else _DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
)
if top[0] < threshold or margin < _DIRECT_EPC_MATCH_MIN_MARGIN:
return None
candidate = top[2]
candidate, score, method, _field = result
return {
"_listing_idx": listing["_listing_idx"],
"_direct_epc_address": candidate.get("_direct_epc_address"),
@ -1452,8 +1318,8 @@ def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict |
),
"_direct_was_council_house": candidate.get("_direct_was_council_house"),
"_direct_epc_match_status": "matched",
"_direct_epc_match_score": round(top[0], 1),
"_direct_epc_match_margin": round(margin, 1),
"_direct_epc_match_score": round(score, 1),
"_direct_epc_match_method": method,
}
@ -1463,25 +1329,14 @@ def _match_direct_epc(
if listing_matches.is_empty() or epc_candidates.is_empty():
return _empty_direct_epc_matches()
buckets = _epc_candidates_by_postcode(epc_candidates)
postcode_tree, postcode_values = _epc_postcode_tree(epc_candidates)
buckets, uprn_index = _index_candidates(
epc_candidates, "_direct_epc_match_postcode", "_direct_epc_uprn"
)
matches = []
for listing in listing_matches.iter_rows(named=True):
candidate_postcodes = _candidate_postcodes_for_listing(
listing, postcode_tree, postcode_values
)
candidate_rows: list[dict] = []
seen_rows: set[int] = set()
for postcode in candidate_postcodes:
for candidate in buckets.get(postcode, []):
row = candidate.get("_direct_epc_row")
if row in seen_rows:
continue
candidate_rows.append(candidate)
if row is not None:
seen_rows.add(row)
match = _best_direct_epc_candidate(listing, candidate_rows)
postcode = listing.get("_listing_match_postcode")
bucket = buckets.get(postcode, []) if postcode else []
match = _best_direct_epc_candidate(listing, uprn_index, bucket)
if match is not None:
matches.append(match)
@ -1493,7 +1348,6 @@ def _match_direct_epc(
def _enrich_listings_with_direct_epc(
listings: pl.DataFrame,
epc_path: Path | None,
arcgis_path: Path,
) -> pl.DataFrame:
if epc_path is None:
return _ensure_direct_epc_columns(listings)
@ -1513,7 +1367,7 @@ def _enrich_listings_with_direct_epc(
prefix="direct_listing_epc_", dir=local_tmp_dir()
) as tmpdir:
epc_candidates = _load_direct_epc_candidates(
epc_path, arcgis_path, listing_outcodes, Path(tmpdir)
epc_path, listing_outcodes, Path(tmpdir)
)
print(f"Direct listing EPC candidates: {epc_candidates.height}")
direct_matches = _match_direct_epc(listing_matches, epc_candidates)
@ -1604,7 +1458,7 @@ def _integrate_listings(
"""
listings = _load_listings_for_merge(listings_path, arcgis_path)
print(f"Listings loaded: {listings.height}")
listings = _enrich_listings_with_direct_epc(listings, epc_path, arcgis_path)
listings = _enrich_listings_with_direct_epc(listings, epc_path)
overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES]
listing_attachment_columns = [
@ -1660,6 +1514,14 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
"""Project the post-rename wide frame down to enriched-listing rows."""
df = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null())
# A matched listing's overlay attaches to every wide row sharing its
# (postcode, pp_address). The terminated-postcode remap can collapse several
# distinct wide rows onto one such key, which would otherwise emit one duplicate
# listing per collapsed row. Each listing matches exactly one (postcode,
# pp_address) and each seed row carries a unique URL, so keeping a single row per
# listing URL collapses only that fan-out and never merges distinct listings.
df = df.unique(subset=[_LISTING_FLAG_COLUMN], keep="first", maintain_order=True)
df = df.with_columns(
pl.col("_actual_listing_url").alias("Listing URL"),
pl.col("_actual_listing_date").alias("Listing date"),
@ -1750,7 +1612,6 @@ def _build(
broadband_path: Path,
conservation_areas_path: Path,
rental_prices_path: Path,
lsoa_population_path: Path,
median_age_path: Path,
election_results_path: Path,
tree_density_postcodes_path: Path | None = None,
@ -1881,8 +1742,10 @@ def _build(
how="left",
)
# Crime is counted spatially per postcode (incidents within 50m of the
# postcode boundary), so it joins on postcode rather than LSOA.
crime = pl.scan_parquet(crime_path)
wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")
wide = wide.join(crime, on="postcode", how="left")
wide = wide.with_columns(
pl.sum_horizontal(
@ -1905,17 +1768,6 @@ def _build(
).alias("minor_crime_avg_yr"),
)
lsoa_pop = pl.scan_parquet(lsoa_population_path)
wide = wide.join(lsoa_pop, on="lsoa21", how="left")
wide = wide.with_columns(
pl.when(pl.col("population") > 0)
.then((pl.col("serious_crime_avg_yr") / pl.col("population") * 1000).round(1))
.alias("serious_crime_per_1k"),
pl.when(pl.col("population") > 0)
.then((pl.col("minor_crime_avg_yr") / pl.col("population") * 1000).round(1))
.alias("minor_crime_per_1k"),
).drop("population")
median_age = pl.scan_parquet(median_age_path)
wide = wide.join(median_age, on="lsoa21", how="left")
@ -2082,8 +1934,6 @@ def _build(
"max_download_speed": "Max available download speed (Mbps)",
"serious_crime_avg_yr": "Serious crime (avg/yr)",
"minor_crime_avg_yr": "Minor crime (avg/yr)",
"serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
"minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
"mean_monthly_rent": "Estimated monthly rent",
"floor_height": "Interior height (m)",
"was_council_house": "Former council house",
@ -2189,12 +2039,6 @@ def main():
required=True,
help="ONS rental prices by LA and bedroom count parquet file",
)
parser.add_argument(
"--lsoa-population",
type=Path,
required=True,
help="Census 2021 population by LSOA parquet file",
)
parser.add_argument(
"--median-age",
type=Path,
@ -2279,7 +2123,6 @@ def main():
broadband_path=args.broadband,
conservation_areas_path=args.conservation_areas,
rental_prices_path=args.rental_prices,
lsoa_population_path=args.lsoa_population,
median_age_path=args.median_age,
election_results_path=args.election_results,
tree_density_postcodes_path=args.tree_density_postcodes,