scraping and data
This commit is contained in:
parent
d98819b569
commit
8688b7475e
43 changed files with 4920 additions and 531 deletions
|
|
@ -48,7 +48,7 @@ _AREA_COLUMNS = [
|
|||
"lon",
|
||||
# Runtime provenance for deciding whether missing coordinates are skippable.
|
||||
"ctry25cd",
|
||||
# Keyed lookup for postcode-level side tables (e.g. crime time series).
|
||||
# Join key for LSOA-level side tables (e.g. median age).
|
||||
"lsoa21",
|
||||
# Deprivation
|
||||
"Income Score",
|
||||
|
|
@ -81,8 +81,6 @@ _AREA_COLUMNS = [
|
|||
"Other crime (avg/yr)",
|
||||
"Serious crime (avg/yr)",
|
||||
"Minor crime (avg/yr)",
|
||||
"Serious crime per 1k residents (avg/yr)",
|
||||
"Minor crime per 1k residents (avg/yr)",
|
||||
# Amenities
|
||||
"Number of restaurants within 2km",
|
||||
"Number of grocery shops and supermarkets within 2km",
|
||||
|
|
@ -742,16 +740,13 @@ _PROPERTY_TYPE_VALUES = [
|
|||
"Other",
|
||||
]
|
||||
_EPC_RATING_VALUES = ["A", "B", "C", "D", "E", "F", "G"]
|
||||
_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
|
||||
_PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
|
||||
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS = 82
|
||||
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS = 96
|
||||
_PROPERTY_MATCH_MIN_MARGIN = 4.0
|
||||
_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS = 82.0
|
||||
_DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 96.0
|
||||
_DIRECT_EPC_MATCH_MIN_MARGIN = 4.0
|
||||
_DIRECT_EPC_NEARBY_RADIUS_M = 500.0
|
||||
_DIRECT_EPC_NEAREST_POSTCODES = 40
|
||||
# Listings are matched to EPC certificates and Price-Paid properties first by
|
||||
# UPRN (exact) and otherwise by fuzzy street-address similarity within the same
|
||||
# postcode. A house number in the listing address is the strong disambiguator,
|
||||
# so a numbered listing may match on a lower street-similarity score than a
|
||||
# number-less one (which must match the street almost exactly to be trusted).
|
||||
_LISTING_MATCH_MIN_SCORE_WITH_NUMBERS = 82
|
||||
_LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS = 90
|
||||
_DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
|
||||
("_direct_epc_address", pl.Utf8),
|
||||
("_direct_current_energy_rating", pl.Utf8),
|
||||
|
|
@ -764,7 +759,7 @@ _DIRECT_EPC_COLUMNS: tuple[tuple[str, pl.DataType], ...] = (
|
|||
("_direct_was_council_house", pl.Utf8),
|
||||
("_direct_epc_match_status", pl.Utf8),
|
||||
("_direct_epc_match_score", pl.Float32),
|
||||
("_direct_epc_match_margin", pl.Float32),
|
||||
("_direct_epc_match_method", pl.Utf8),
|
||||
)
|
||||
_DIRECT_EPC_RAW_COLUMN_MAP = {
|
||||
"epc_address": "_direct_epc_address",
|
||||
|
|
@ -840,46 +835,6 @@ def _construction_year_expr(column: str = "construction_age_band") -> pl.Expr:
|
|||
)
|
||||
|
||||
|
||||
def _ratio_bonus(
|
||||
left: float | int | None, right: float | int | None, pct: float, cap: float
|
||||
) -> float:
|
||||
if left is None or right is None:
|
||||
return 0.0
|
||||
try:
|
||||
left_f = float(left)
|
||||
right_f = float(right)
|
||||
except (TypeError, ValueError):
|
||||
return 0.0
|
||||
if left_f <= 0 or right_f <= 0:
|
||||
return 0.0
|
||||
rel = abs(left_f - right_f) / max(left_f, right_f)
|
||||
if rel > pct:
|
||||
return 0.0
|
||||
return cap * (1.0 - rel / pct)
|
||||
|
||||
|
||||
def _rooms_bonus(left: int | None, right: int | None) -> float:
|
||||
if left is None or right is None:
|
||||
return 0.0
|
||||
try:
|
||||
diff = abs(int(left) - int(right))
|
||||
except (TypeError, ValueError):
|
||||
return 0.0
|
||||
if diff == 0:
|
||||
return 4.0
|
||||
if diff == 1:
|
||||
return 2.0
|
||||
return 0.0
|
||||
|
||||
|
||||
def _enum_bonus(
|
||||
left: str | None, right: str | None, *, exact: float, mismatch: float
|
||||
) -> float:
|
||||
if not left or not right:
|
||||
return 0.0
|
||||
return exact if left == right else mismatch
|
||||
|
||||
|
||||
def _address_score(query: str, candidate: str | None) -> int:
|
||||
if not candidate:
|
||||
return 0
|
||||
|
|
@ -893,6 +848,85 @@ def _has_number(address: str | None) -> bool:
|
|||
return bool(address and _NUMBER_RE.search(address))
|
||||
|
||||
|
||||
def _normalize_uprn(value: object) -> str | None:
|
||||
"""Canonical UPRN string (digits only) or None.
|
||||
|
||||
UPRNs arrive as strings or ints from the scraper / EPC register; normalise
|
||||
so a listing UPRN and an EPC/property UPRN compare equal regardless of dtype
|
||||
or stray whitespace. A float (e.g. a NaN-bearing column read as Float) is
|
||||
rejected unless it is an exact integer, so "123.0"/"1.5e11" can never be
|
||||
silently mangled into a bogus all-digits key.
|
||||
"""
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, float):
|
||||
if not value.is_integer():
|
||||
return None
|
||||
value = int(value)
|
||||
digits = re.sub(r"\D", "", str(value))
|
||||
return digits or None
|
||||
|
||||
|
||||
def _best_listing_match(
|
||||
listing_uprn: str | None,
|
||||
query: str | None,
|
||||
uprn_index: dict[str, dict],
|
||||
bucket_candidates: list[dict],
|
||||
addressed_fields: list[str],
|
||||
) -> tuple[dict, float, str, str | None] | None:
|
||||
"""Pick the best candidate for a listing.
|
||||
|
||||
Matching is, in order: (1) an exact UPRN equality against the global
|
||||
``uprn_index`` (postcode-independent, so it is robust even when the
|
||||
listing's postcode is slightly off); (2) failing that, the highest
|
||||
fuzzy street-address similarity within the listing's own postcode bucket.
|
||||
No property-attribute heuristics are used — a house number in the listing
|
||||
address gates the fuzzy match (`_numbers_compatible`) and lowers the score
|
||||
threshold; a number-less address must match the street almost exactly.
|
||||
|
||||
``addressed_fields`` names the candidate columns to fuzzy-match against (a
|
||||
candidate may carry both a register and an EPC address). Returns
|
||||
``(candidate, score, method, matched_field)`` or None. ``method`` is
|
||||
"uprn" or "address"; ``matched_field`` is the winning address column (or
|
||||
None for a UPRN match).
|
||||
"""
|
||||
if listing_uprn:
|
||||
hit = uprn_index.get(listing_uprn)
|
||||
if hit is not None:
|
||||
return hit, 100.0, "uprn", None
|
||||
|
||||
if not query:
|
||||
return None
|
||||
|
||||
listing_has_numbers = _has_number(query)
|
||||
best: dict | None = None
|
||||
best_score = 0
|
||||
best_field: str | None = None
|
||||
for candidate in bucket_candidates:
|
||||
for field in addressed_fields:
|
||||
address = candidate.get(field)
|
||||
if not address:
|
||||
continue
|
||||
if listing_has_numbers and not _numbers_compatible(query, address):
|
||||
continue
|
||||
score = _address_score(query, address)
|
||||
if score > best_score:
|
||||
best_score = score
|
||||
best = candidate
|
||||
best_field = field
|
||||
|
||||
if best is None:
|
||||
return None
|
||||
threshold = (
|
||||
_LISTING_MATCH_MIN_SCORE_WITH_NUMBERS
|
||||
if listing_has_numbers
|
||||
else _LISTING_MATCH_MIN_SCORE_WITHOUT_NUMBERS
|
||||
)
|
||||
if best_score < threshold:
|
||||
return None
|
||||
return best, float(best_score), "address", best_field
|
||||
|
||||
|
||||
def _load_listings_for_merge(
|
||||
listings_path: Path, arcgis_path: Path
|
||||
) -> pl.DataFrame:
|
||||
|
|
@ -908,6 +942,20 @@ def _load_listings_for_merge(
|
|||
raw = pl.scan_parquet(listings_path).with_row_index("_listing_idx")
|
||||
postcode_mapping = build_postcode_mapping(arcgis_path).lazy()
|
||||
|
||||
# UPRN is only present on scraped listings that carry it (Zoopla detail
|
||||
# pages); tolerate its absence so older parquets and test fixtures still
|
||||
# load. Digits-only so it compares equal to the EPC register's UPRN.
|
||||
if "UPRN" in raw.collect_schema().names():
|
||||
uprn_digits = pl.col("UPRN").cast(pl.Utf8).str.replace_all(r"\D", "")
|
||||
listing_uprn_expr = (
|
||||
pl.when(uprn_digits.str.len_chars() > 0)
|
||||
.then(uprn_digits)
|
||||
.otherwise(None)
|
||||
.alias("_listing_uprn")
|
||||
)
|
||||
else:
|
||||
listing_uprn_expr = pl.lit(None, dtype=pl.Utf8).alias("_listing_uprn")
|
||||
|
||||
# Listings parquets occasionally carry Float NaNs (e.g. floor area). Polars
|
||||
# treats NaN as distinct from null and the downstream `latest_price /
|
||||
# total_floor_area` cast to Int32 explodes on a NaN, so we normalise floats
|
||||
|
|
@ -936,12 +984,14 @@ def _load_listings_for_merge(
|
|||
"postcode"
|
||||
),
|
||||
pl.col("Address per Property Register").alias("pp_address"),
|
||||
listing_uprn_expr,
|
||||
*overlay,
|
||||
)
|
||||
.select(
|
||||
"_listing_idx",
|
||||
"postcode",
|
||||
"pp_address",
|
||||
"_listing_uprn",
|
||||
*[dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES],
|
||||
)
|
||||
.collect(engine="streaming")
|
||||
|
|
@ -972,7 +1022,6 @@ def _empty_direct_epc_matches() -> pl.DataFrame:
|
|||
|
||||
def _load_direct_epc_candidates(
|
||||
epc_path: Path,
|
||||
arcgis_path: Path,
|
||||
listing_outcodes: list[str],
|
||||
temp_dir: Path,
|
||||
) -> pl.DataFrame:
|
||||
|
|
@ -982,8 +1031,7 @@ def _load_direct_epc_candidates(
|
|||
"_direct_epc_match_postcode": pl.Utf8,
|
||||
"_direct_epc_outcode": pl.Utf8,
|
||||
"_direct_epc_canonical_property_type": pl.Utf8,
|
||||
"_direct_epc_east": pl.Float64,
|
||||
"_direct_epc_north": pl.Float64,
|
||||
"_direct_epc_uprn": pl.Utf8,
|
||||
**{column: dtype for column, dtype in _DIRECT_EPC_COLUMNS if column.startswith("_direct_")},
|
||||
}
|
||||
if not listing_outcodes:
|
||||
|
|
@ -1016,12 +1064,6 @@ def _load_direct_epc_candidates(
|
|||
.with_columns(pl.lit("Yes").alias("_direct_was_council_house"))
|
||||
)
|
||||
|
||||
arcgis = pl.scan_parquet(arcgis_path).select(
|
||||
normalize_postcode_key(pl.col("pcds")).alias("_direct_epc_match_postcode"),
|
||||
pl.col("east1m").alias("_direct_epc_east"),
|
||||
pl.col("north1m").alias("_direct_epc_north"),
|
||||
)
|
||||
|
||||
return (
|
||||
epc_base.sort("inspection_date", descending=True)
|
||||
.group_by("_direct_epc_match_address", "_direct_epc_match_postcode")
|
||||
|
|
@ -1031,7 +1073,6 @@ def _load_direct_epc_candidates(
|
|||
on=["_direct_epc_match_address", "_direct_epc_match_postcode"],
|
||||
how="left",
|
||||
)
|
||||
.join(arcgis, on="_direct_epc_match_postcode", how="left")
|
||||
.with_columns(
|
||||
_canonical_epc_property_type_expr().alias(
|
||||
"_direct_epc_canonical_property_type"
|
||||
|
|
@ -1046,6 +1087,7 @@ def _load_direct_epc_candidates(
|
|||
.otherwise(None)
|
||||
.alias("_direct_potential_energy_rating"),
|
||||
pl.col("epc_address").alias("_direct_epc_address"),
|
||||
pl.col("uprn").alias("_direct_epc_uprn"),
|
||||
pl.col("total_floor_area").alias("_direct_total_floor_area"),
|
||||
pl.col("number_habitable_rooms").alias(
|
||||
"_direct_number_habitable_rooms"
|
||||
|
|
@ -1066,8 +1108,7 @@ def _load_direct_epc_candidates(
|
|||
"_direct_epc_match_postcode",
|
||||
"_direct_epc_outcode",
|
||||
"_direct_epc_canonical_property_type",
|
||||
"_direct_epc_east",
|
||||
"_direct_epc_north",
|
||||
"_direct_epc_uprn",
|
||||
"_direct_epc_address",
|
||||
"_direct_current_energy_rating",
|
||||
"_direct_potential_energy_rating",
|
||||
|
|
@ -1083,7 +1124,14 @@ def _load_direct_epc_candidates(
|
|||
|
||||
|
||||
def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
|
||||
match = listings.with_columns(
|
||||
"""Add the normalised address/postcode/outcode keys used to match listings.
|
||||
|
||||
Listings are matched to EPC certificates and properties by UPRN and by
|
||||
fuzzy street address within their (now accurate, detail-page-sourced)
|
||||
postcode — never by coordinate proximity — so no projected easting/northing
|
||||
is computed here. `_listing_uprn` flows through from the loaded listings.
|
||||
"""
|
||||
return listings.with_columns(
|
||||
normalize_address_key(pl.col("pp_address")).alias("_listing_match_address"),
|
||||
normalize_postcode_key(pl.col("postcode")).alias("_listing_match_postcode"),
|
||||
).with_columns(
|
||||
|
|
@ -1092,21 +1140,6 @@ def _listing_match_frame(listings: pl.DataFrame) -> pl.DataFrame:
|
|||
.alias("_listing_outcode")
|
||||
)
|
||||
|
||||
if match.is_empty():
|
||||
return match.with_columns(
|
||||
pl.Series("_listing_east", [], dtype=pl.Float64),
|
||||
pl.Series("_listing_north", [], dtype=pl.Float64),
|
||||
)
|
||||
|
||||
transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
|
||||
east, north = transformer.transform(
|
||||
match["_actual_lon"].to_numpy(), match["_actual_lat"].to_numpy()
|
||||
)
|
||||
return match.with_columns(
|
||||
pl.Series("_listing_east", east, dtype=pl.Float64),
|
||||
pl.Series("_listing_north", north, dtype=pl.Float64),
|
||||
)
|
||||
|
||||
|
||||
def _optional_lazy_col(
|
||||
schema: pl.Schema, column: str, dtype: pl.DataType
|
||||
|
|
@ -1122,8 +1155,7 @@ def _listing_property_match_schema() -> dict[str, pl.DataType]:
|
|||
"_matched_postcode": pl.Utf8,
|
||||
"_matched_pp_address": pl.Utf8,
|
||||
"_property_match_score": pl.Float32,
|
||||
"_property_match_address_score": pl.Int32,
|
||||
"_property_match_margin": pl.Float32,
|
||||
"_property_match_method": pl.Utf8,
|
||||
"_property_match_field": pl.Utf8,
|
||||
}
|
||||
|
||||
|
|
@ -1139,11 +1171,8 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
|
|||
pl.col("postcode").cast(pl.Utf8).alias("postcode"),
|
||||
pl.col("pp_address").cast(pl.Utf8).alias("pp_address"),
|
||||
_optional_lazy_col(schema, "epc_address", pl.Utf8),
|
||||
_optional_lazy_col(schema, "pp_property_type", pl.Utf8),
|
||||
_optional_lazy_col(schema, "duration", pl.Utf8),
|
||||
_optional_lazy_col(schema, "total_floor_area", pl.Float64),
|
||||
_optional_lazy_col(schema, "number_habitable_rooms", pl.Int16),
|
||||
_optional_lazy_col(schema, "latest_price", pl.Int64),
|
||||
# UPRN keys the exact match; present once epc_pp is rebuilt with it.
|
||||
_optional_lazy_col(schema, "uprn", pl.Utf8),
|
||||
)
|
||||
.with_row_index("_property_row")
|
||||
.with_columns(
|
||||
|
|
@ -1167,110 +1196,52 @@ def _property_match_candidate_frame(wide: pl.LazyFrame) -> pl.DataFrame:
|
|||
)
|
||||
|
||||
|
||||
def _property_candidates_by_postcode(
|
||||
candidates: pl.DataFrame,
|
||||
) -> dict[str, list[dict]]:
|
||||
def _index_candidates(
|
||||
candidates: pl.DataFrame, postcode_key: str, uprn_key: str
|
||||
) -> tuple[dict[str, list[dict]], dict[str, dict]]:
|
||||
"""Index candidate rows for matching, in a single pass over the frame.
|
||||
|
||||
Returns ``(postcode_buckets, uprn_index)``. The postcode buckets drive the
|
||||
fuzzy street-address match; the UPRN index drives the exact match and is
|
||||
postcode-independent, so it still resolves when a listing's postcode is
|
||||
slightly off.
|
||||
"""
|
||||
buckets: dict[str, list[dict]] = {}
|
||||
uprn_index: dict[str, dict] = {}
|
||||
for row in candidates.iter_rows(named=True):
|
||||
postcode = row.get("_property_match_postcode")
|
||||
postcode = row.get(postcode_key)
|
||||
if postcode:
|
||||
buckets.setdefault(postcode, []).append(row)
|
||||
return buckets
|
||||
uprn = _normalize_uprn(row.get(uprn_key))
|
||||
if uprn and uprn not in uprn_index:
|
||||
uprn_index[uprn] = row
|
||||
return buckets, uprn_index
|
||||
|
||||
|
||||
def _best_listing_property_candidate(
|
||||
listing: dict, candidates: list[dict]
|
||||
listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
|
||||
) -> dict | None:
|
||||
query = listing.get("_listing_match_address")
|
||||
if not query:
|
||||
return None
|
||||
|
||||
listing_has_numbers = _has_number(query)
|
||||
scored: list[tuple[float, int, dict, str]] = []
|
||||
for candidate in candidates:
|
||||
register_address = candidate.get("_property_match_address")
|
||||
epc_address = candidate.get("_property_epc_match_address")
|
||||
register_numbers_compatible = bool(
|
||||
register_address and _numbers_compatible(query, register_address)
|
||||
)
|
||||
epc_numbers_compatible = bool(
|
||||
epc_address and _numbers_compatible(query, epc_address)
|
||||
)
|
||||
if not (register_numbers_compatible or epc_numbers_compatible):
|
||||
continue
|
||||
|
||||
register_score = _address_score(query, register_address)
|
||||
epc_score = _address_score(query, epc_address)
|
||||
base_score = max(register_score, epc_score)
|
||||
if base_score == 0:
|
||||
continue
|
||||
|
||||
score = float(base_score)
|
||||
score += _enum_bonus(
|
||||
listing.get("_actual_property_type"),
|
||||
candidate.get("pp_property_type"),
|
||||
exact=7.0,
|
||||
mismatch=-8.0,
|
||||
)
|
||||
score += _enum_bonus(
|
||||
listing.get("_actual_leasehold_freehold"),
|
||||
candidate.get("duration"),
|
||||
exact=3.0,
|
||||
mismatch=-3.0,
|
||||
)
|
||||
score += _ratio_bonus(
|
||||
listing.get("_actual_total_floor_area"),
|
||||
candidate.get("total_floor_area"),
|
||||
pct=0.15,
|
||||
cap=8.0,
|
||||
)
|
||||
score += _rooms_bonus(
|
||||
listing.get("_actual_number_habitable_rooms"),
|
||||
candidate.get("number_habitable_rooms"),
|
||||
)
|
||||
score += _ratio_bonus(
|
||||
listing.get("_actual_asking_price"),
|
||||
candidate.get("latest_price"),
|
||||
pct=0.25,
|
||||
cap=3.0,
|
||||
)
|
||||
matched_field = (
|
||||
"pp_address" if register_score >= epc_score else "epc_address"
|
||||
)
|
||||
scored.append((score, base_score, candidate, matched_field))
|
||||
|
||||
if not scored:
|
||||
return None
|
||||
scored.sort(key=lambda item: item[0], reverse=True)
|
||||
top = scored[0]
|
||||
runner_up = scored[1][0] if len(scored) > 1 else None
|
||||
margin = top[0] - runner_up if runner_up is not None else top[0]
|
||||
score_threshold = (
|
||||
_PROPERTY_MATCH_MIN_SCORE_WITH_NUMBERS
|
||||
if listing_has_numbers
|
||||
else _PROPERTY_MATCH_MIN_SCORE_WITHOUT_NUMBERS
|
||||
result = _best_listing_match(
|
||||
listing.get("_listing_uprn"),
|
||||
listing.get("_listing_match_address"),
|
||||
uprn_index,
|
||||
candidates,
|
||||
["_property_match_address", "_property_epc_match_address"],
|
||||
)
|
||||
address_threshold = (
|
||||
_PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITH_NUMBERS
|
||||
if listing_has_numbers
|
||||
else _PROPERTY_MATCH_MIN_ADDRESS_SCORE_WITHOUT_NUMBERS
|
||||
)
|
||||
if (
|
||||
top[0] < score_threshold
|
||||
or top[1] < address_threshold
|
||||
or margin < _PROPERTY_MATCH_MIN_MARGIN
|
||||
):
|
||||
if result is None:
|
||||
return None
|
||||
|
||||
candidate = top[2]
|
||||
candidate, score, method, field = result
|
||||
matched_field = {
|
||||
"_property_match_address": "pp_address",
|
||||
"_property_epc_match_address": "epc_address",
|
||||
}.get(field, method)
|
||||
return {
|
||||
"_listing_idx": listing["_listing_idx"],
|
||||
"_matched_postcode": candidate.get("postcode"),
|
||||
"_matched_pp_address": candidate.get("pp_address"),
|
||||
"_property_match_score": round(top[0], 1),
|
||||
"_property_match_address_score": top[1],
|
||||
"_property_match_margin": round(margin, 1),
|
||||
"_property_match_field": top[3],
|
||||
"_property_match_score": round(score, 1),
|
||||
"_property_match_method": method,
|
||||
"_property_match_field": matched_field,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1280,23 +1251,32 @@ def _match_listing_properties(
|
|||
if listing_matches.is_empty() or property_candidates.is_empty():
|
||||
return _empty_listing_property_matches()
|
||||
|
||||
buckets = _property_candidates_by_postcode(property_candidates)
|
||||
buckets, uprn_index = _index_candidates(
|
||||
property_candidates, "_property_match_postcode", "uprn"
|
||||
)
|
||||
best_matches = []
|
||||
for listing in listing_matches.iter_rows(named=True):
|
||||
postcode = listing.get("_listing_match_postcode")
|
||||
if not postcode:
|
||||
continue
|
||||
match = _best_listing_property_candidate(listing, buckets.get(postcode, []))
|
||||
bucket = buckets.get(postcode, []) if postcode else []
|
||||
match = _best_listing_property_candidate(listing, uprn_index, bucket)
|
||||
if match is not None:
|
||||
best_matches.append(match)
|
||||
|
||||
if not best_matches:
|
||||
return _empty_listing_property_matches()
|
||||
|
||||
# When two listings claim the same property, keep the most authoritative
|
||||
# match: an exact UPRN match always wins over a fuzzy address match (both can
|
||||
# score 100, so method must break the tie before score and listing index).
|
||||
matches = pl.DataFrame(best_matches, schema=_listing_property_match_schema())
|
||||
return (
|
||||
matches.sort(
|
||||
["_property_match_score", "_listing_idx"], descending=[True, False]
|
||||
[
|
||||
pl.col("_property_match_method") == "uprn",
|
||||
"_property_match_score",
|
||||
"_listing_idx",
|
||||
],
|
||||
descending=[True, True, False],
|
||||
)
|
||||
.unique(
|
||||
["_matched_postcode", "_matched_pp_address"],
|
||||
|
|
@ -1307,133 +1287,19 @@ def _match_listing_properties(
|
|||
)
|
||||
|
||||
|
||||
def _epc_candidates_by_postcode(candidates: pl.DataFrame) -> dict[str, list[dict]]:
|
||||
buckets: dict[str, list[dict]] = {}
|
||||
for row in candidates.iter_rows(named=True):
|
||||
postcode = row.get("_direct_epc_match_postcode")
|
||||
if postcode:
|
||||
buckets.setdefault(postcode, []).append(row)
|
||||
return buckets
|
||||
|
||||
|
||||
def _epc_postcode_tree(
|
||||
candidates: pl.DataFrame,
|
||||
) -> tuple[cKDTree | None, list[str]]:
|
||||
postcode_points = (
|
||||
candidates.select(
|
||||
"_direct_epc_match_postcode",
|
||||
"_direct_epc_east",
|
||||
"_direct_epc_north",
|
||||
)
|
||||
.drop_nulls()
|
||||
.filter(
|
||||
pl.col("_direct_epc_east").is_finite()
|
||||
& pl.col("_direct_epc_north").is_finite()
|
||||
)
|
||||
.unique("_direct_epc_match_postcode")
|
||||
def _best_direct_epc_candidate(
|
||||
listing: dict, uprn_index: dict[str, dict], candidates: list[dict]
|
||||
) -> dict | None:
|
||||
result = _best_listing_match(
|
||||
listing.get("_listing_uprn"),
|
||||
listing.get("_listing_match_address"),
|
||||
uprn_index,
|
||||
candidates,
|
||||
["_direct_epc_match_address"],
|
||||
)
|
||||
if postcode_points.is_empty():
|
||||
return None, []
|
||||
coords = np.column_stack(
|
||||
[
|
||||
postcode_points["_direct_epc_east"].to_numpy(),
|
||||
postcode_points["_direct_epc_north"].to_numpy(),
|
||||
]
|
||||
)
|
||||
return cKDTree(coords), postcode_points["_direct_epc_match_postcode"].to_list()
|
||||
|
||||
|
||||
def _candidate_postcodes_for_listing(
|
||||
listing: dict,
|
||||
postcode_tree: cKDTree | None,
|
||||
postcode_values: list[str],
|
||||
) -> list[str]:
|
||||
postcodes: list[str] = []
|
||||
exact = listing.get("_listing_match_postcode")
|
||||
if exact:
|
||||
postcodes.append(exact)
|
||||
|
||||
if postcode_tree is None:
|
||||
return postcodes
|
||||
|
||||
east = listing.get("_listing_east")
|
||||
north = listing.get("_listing_north")
|
||||
try:
|
||||
east_f = float(east)
|
||||
north_f = float(north)
|
||||
except (TypeError, ValueError):
|
||||
return postcodes
|
||||
if not np.isfinite(east_f) or not np.isfinite(north_f):
|
||||
return postcodes
|
||||
|
||||
k = min(_DIRECT_EPC_NEAREST_POSTCODES, len(postcode_values))
|
||||
distances, indices = postcode_tree.query(
|
||||
[east_f, north_f],
|
||||
k=k,
|
||||
distance_upper_bound=_DIRECT_EPC_NEARBY_RADIUS_M,
|
||||
)
|
||||
distances = np.atleast_1d(distances)
|
||||
indices = np.atleast_1d(indices)
|
||||
seen = set(postcodes)
|
||||
for distance, idx in zip(distances, indices, strict=False):
|
||||
if not np.isfinite(distance) or idx >= len(postcode_values):
|
||||
continue
|
||||
postcode = postcode_values[int(idx)]
|
||||
if postcode not in seen:
|
||||
postcodes.append(postcode)
|
||||
seen.add(postcode)
|
||||
return postcodes
|
||||
|
||||
|
||||
def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict | None:
|
||||
query = listing.get("_listing_match_address")
|
||||
if not query:
|
||||
if result is None:
|
||||
return None
|
||||
|
||||
listing_has_numbers = _has_number(query)
|
||||
scored: list[tuple[float, int, dict]] = []
|
||||
for candidate in candidates:
|
||||
address = candidate.get("_direct_epc_match_address")
|
||||
if listing_has_numbers and not _numbers_compatible(query, address or ""):
|
||||
continue
|
||||
base_score = _address_score(query, address)
|
||||
if base_score == 0:
|
||||
continue
|
||||
|
||||
score = float(base_score)
|
||||
score += _enum_bonus(
|
||||
listing.get("_actual_property_type"),
|
||||
candidate.get("_direct_epc_canonical_property_type"),
|
||||
exact=6.0,
|
||||
mismatch=-6.0,
|
||||
)
|
||||
score += _ratio_bonus(
|
||||
listing.get("_actual_total_floor_area"),
|
||||
candidate.get("_direct_total_floor_area"),
|
||||
pct=0.12,
|
||||
cap=8.0,
|
||||
)
|
||||
score += _rooms_bonus(
|
||||
listing.get("_actual_number_habitable_rooms"),
|
||||
candidate.get("_direct_number_habitable_rooms"),
|
||||
)
|
||||
scored.append((score, base_score, candidate))
|
||||
|
||||
if not scored:
|
||||
return None
|
||||
scored.sort(key=lambda item: item[0], reverse=True)
|
||||
top = scored[0]
|
||||
runner_up = scored[1][0] if len(scored) > 1 else None
|
||||
margin = top[0] - runner_up if runner_up is not None else top[0]
|
||||
threshold = (
|
||||
_DIRECT_EPC_MATCH_MIN_SCORE_WITH_NUMBERS
|
||||
if listing_has_numbers
|
||||
else _DIRECT_EPC_MATCH_MIN_SCORE_WITHOUT_NUMBERS
|
||||
)
|
||||
if top[0] < threshold or margin < _DIRECT_EPC_MATCH_MIN_MARGIN:
|
||||
return None
|
||||
|
||||
candidate = top[2]
|
||||
candidate, score, method, _field = result
|
||||
return {
|
||||
"_listing_idx": listing["_listing_idx"],
|
||||
"_direct_epc_address": candidate.get("_direct_epc_address"),
|
||||
|
|
@ -1452,8 +1318,8 @@ def _best_direct_epc_candidate(listing: dict, candidates: list[dict]) -> dict |
|
|||
),
|
||||
"_direct_was_council_house": candidate.get("_direct_was_council_house"),
|
||||
"_direct_epc_match_status": "matched",
|
||||
"_direct_epc_match_score": round(top[0], 1),
|
||||
"_direct_epc_match_margin": round(margin, 1),
|
||||
"_direct_epc_match_score": round(score, 1),
|
||||
"_direct_epc_match_method": method,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -1463,25 +1329,14 @@ def _match_direct_epc(
|
|||
if listing_matches.is_empty() or epc_candidates.is_empty():
|
||||
return _empty_direct_epc_matches()
|
||||
|
||||
buckets = _epc_candidates_by_postcode(epc_candidates)
|
||||
postcode_tree, postcode_values = _epc_postcode_tree(epc_candidates)
|
||||
|
||||
buckets, uprn_index = _index_candidates(
|
||||
epc_candidates, "_direct_epc_match_postcode", "_direct_epc_uprn"
|
||||
)
|
||||
matches = []
|
||||
for listing in listing_matches.iter_rows(named=True):
|
||||
candidate_postcodes = _candidate_postcodes_for_listing(
|
||||
listing, postcode_tree, postcode_values
|
||||
)
|
||||
candidate_rows: list[dict] = []
|
||||
seen_rows: set[int] = set()
|
||||
for postcode in candidate_postcodes:
|
||||
for candidate in buckets.get(postcode, []):
|
||||
row = candidate.get("_direct_epc_row")
|
||||
if row in seen_rows:
|
||||
continue
|
||||
candidate_rows.append(candidate)
|
||||
if row is not None:
|
||||
seen_rows.add(row)
|
||||
match = _best_direct_epc_candidate(listing, candidate_rows)
|
||||
postcode = listing.get("_listing_match_postcode")
|
||||
bucket = buckets.get(postcode, []) if postcode else []
|
||||
match = _best_direct_epc_candidate(listing, uprn_index, bucket)
|
||||
if match is not None:
|
||||
matches.append(match)
|
||||
|
||||
|
|
@ -1493,7 +1348,6 @@ def _match_direct_epc(
|
|||
def _enrich_listings_with_direct_epc(
|
||||
listings: pl.DataFrame,
|
||||
epc_path: Path | None,
|
||||
arcgis_path: Path,
|
||||
) -> pl.DataFrame:
|
||||
if epc_path is None:
|
||||
return _ensure_direct_epc_columns(listings)
|
||||
|
|
@ -1513,7 +1367,7 @@ def _enrich_listings_with_direct_epc(
|
|||
prefix="direct_listing_epc_", dir=local_tmp_dir()
|
||||
) as tmpdir:
|
||||
epc_candidates = _load_direct_epc_candidates(
|
||||
epc_path, arcgis_path, listing_outcodes, Path(tmpdir)
|
||||
epc_path, listing_outcodes, Path(tmpdir)
|
||||
)
|
||||
print(f"Direct listing EPC candidates: {epc_candidates.height}")
|
||||
direct_matches = _match_direct_epc(listing_matches, epc_candidates)
|
||||
|
|
@ -1604,7 +1458,7 @@ def _integrate_listings(
|
|||
"""
|
||||
listings = _load_listings_for_merge(listings_path, arcgis_path)
|
||||
print(f"Listings loaded: {listings.height}")
|
||||
listings = _enrich_listings_with_direct_epc(listings, epc_path, arcgis_path)
|
||||
listings = _enrich_listings_with_direct_epc(listings, epc_path)
|
||||
|
||||
overlay_columns = [dst for _src, dst, _dt in _LISTING_OVERLAY_SOURCES]
|
||||
listing_attachment_columns = [
|
||||
|
|
@ -1660,6 +1514,14 @@ def _finalize_listings(df: pl.DataFrame) -> pl.DataFrame:
|
|||
"""Project the post-rename wide frame down to enriched-listing rows."""
|
||||
df = df.filter(pl.col(_LISTING_FLAG_COLUMN).is_not_null())
|
||||
|
||||
# A matched listing's overlay attaches to every wide row sharing its
|
||||
# (postcode, pp_address). The terminated-postcode remap can collapse several
|
||||
# distinct wide rows onto one such key, which would otherwise emit one duplicate
|
||||
# listing per collapsed row. Each listing matches exactly one (postcode,
|
||||
# pp_address) and each seed row carries a unique URL, so keeping a single row per
|
||||
# listing URL collapses only that fan-out and never merges distinct listings.
|
||||
df = df.unique(subset=[_LISTING_FLAG_COLUMN], keep="first", maintain_order=True)
|
||||
|
||||
df = df.with_columns(
|
||||
pl.col("_actual_listing_url").alias("Listing URL"),
|
||||
pl.col("_actual_listing_date").alias("Listing date"),
|
||||
|
|
@ -1750,7 +1612,6 @@ def _build(
|
|||
broadband_path: Path,
|
||||
conservation_areas_path: Path,
|
||||
rental_prices_path: Path,
|
||||
lsoa_population_path: Path,
|
||||
median_age_path: Path,
|
||||
election_results_path: Path,
|
||||
tree_density_postcodes_path: Path | None = None,
|
||||
|
|
@ -1881,8 +1742,10 @@ def _build(
|
|||
how="left",
|
||||
)
|
||||
|
||||
# Crime is counted spatially per postcode (incidents within 50m of the
|
||||
# postcode boundary), so it joins on postcode rather than LSOA.
|
||||
crime = pl.scan_parquet(crime_path)
|
||||
wide = wide.join(crime, left_on="lsoa21", right_on="LSOA code", how="left")
|
||||
wide = wide.join(crime, on="postcode", how="left")
|
||||
|
||||
wide = wide.with_columns(
|
||||
pl.sum_horizontal(
|
||||
|
|
@ -1905,17 +1768,6 @@ def _build(
|
|||
).alias("minor_crime_avg_yr"),
|
||||
)
|
||||
|
||||
lsoa_pop = pl.scan_parquet(lsoa_population_path)
|
||||
wide = wide.join(lsoa_pop, on="lsoa21", how="left")
|
||||
wide = wide.with_columns(
|
||||
pl.when(pl.col("population") > 0)
|
||||
.then((pl.col("serious_crime_avg_yr") / pl.col("population") * 1000).round(1))
|
||||
.alias("serious_crime_per_1k"),
|
||||
pl.when(pl.col("population") > 0)
|
||||
.then((pl.col("minor_crime_avg_yr") / pl.col("population") * 1000).round(1))
|
||||
.alias("minor_crime_per_1k"),
|
||||
).drop("population")
|
||||
|
||||
median_age = pl.scan_parquet(median_age_path)
|
||||
wide = wide.join(median_age, on="lsoa21", how="left")
|
||||
|
||||
|
|
@ -2082,8 +1934,6 @@ def _build(
|
|||
"max_download_speed": "Max available download speed (Mbps)",
|
||||
"serious_crime_avg_yr": "Serious crime (avg/yr)",
|
||||
"minor_crime_avg_yr": "Minor crime (avg/yr)",
|
||||
"serious_crime_per_1k": "Serious crime per 1k residents (avg/yr)",
|
||||
"minor_crime_per_1k": "Minor crime per 1k residents (avg/yr)",
|
||||
"mean_monthly_rent": "Estimated monthly rent",
|
||||
"floor_height": "Interior height (m)",
|
||||
"was_council_house": "Former council house",
|
||||
|
|
@ -2189,12 +2039,6 @@ def main():
|
|||
required=True,
|
||||
help="ONS rental prices by LA and bedroom count parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lsoa-population",
|
||||
type=Path,
|
||||
required=True,
|
||||
help="Census 2021 population by LSOA parquet file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--median-age",
|
||||
type=Path,
|
||||
|
|
@ -2279,7 +2123,6 @@ def main():
|
|||
broadband_path=args.broadband,
|
||||
conservation_areas_path=args.conservation_areas,
|
||||
rental_prices_path=args.rental_prices,
|
||||
lsoa_population_path=args.lsoa_population,
|
||||
median_age_path=args.median_age,
|
||||
election_results_path=args.election_results,
|
||||
tree_density_postcodes_path=args.tree_density_postcodes,
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue