Fable findings in data

This commit is contained in:
Andras Schmelczer 2026-06-11 07:49:23 +01:00
parent b98bc6d611
commit 6a33b03fdf
20 changed files with 1502 additions and 274 deletions

View file

@ -31,6 +31,22 @@ RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
# conservative tradeoff to keep clearly-implausible transfers out.
MIN_PRICE = 10_000
# Time-aware consecutive-sale jump guard. Price-paid contains keyed-in price
# errors that pass the MIN_PRICE/category filters — e.g. 13 QUICKSETTS HR2 7PP,
# a 93 m² terrace, sold £140,000 in 2016 then "£207,500,000" in 2026 (clearly
# £207,500 with extra digits, lodged as category A) — and would otherwise
# become latest_price. A quality sale is flagged when its price exceeds that
# of a neighbouring (previous or next) sale by more than a factor of
# JUMP_TOLERANCE * JUMP_GROWTH_PER_YEAR ** years, where years is the time
# between the two sales. Calibration: genuine extreme appreciation (prime
# London 1995->2026 is roughly x50 over 31 years) stays comfortably under
# 12 * 1.10**31 ≈ 230, while the HR2 case (x1,482 over 10 years against a
# threshold of 12 * 1.10**10 ≈ 31) is caught. JUMP_MIN_PRICE is an absolute
# floor on the flagged price itself so right-to-buy resales and other
# legitimate x20-50 jumps on cheap properties are never flagged.
# Flat multiplier by which a sale may exceed its neighbouring sale, before
# the per-year growth allowance is applied on top.
JUMP_TOLERANCE = 12.0
# Compounding growth allowance per year between the two sales (1.10 = 10%).
JUMP_GROWTH_PER_YEAR = 1.10
# Sales priced below this are never flagged as jump outliers.
JUMP_MIN_PRICE = 2_000_000
# Plausible construction-year range; band-derived years outside it (e.g. OCR
# noise like 1012 or 2202) are nulled rather than published.
MIN_BUILD_YEAR = 1700
@ -286,6 +302,64 @@ def _scan_epc_certificates(epc_path: Path, temp_dir: Path) -> pl.LazyFrame:
return _select_epc_columns(raw)
def flag_price_outliers(slim: pl.DataFrame) -> pl.DataFrame:
    """Flag the implausible side of extreme consecutive-sale price jumps.

    ``slim`` holds one row per quality (>= MIN_PRICE, category A) sale:
    (_pp_group_address, _pp_group_postcode, date_of_transfer, price). Per
    property, each sale is compared against its previous and next sale and
    the HIGHER sale of an implausible pair is flagged:

    - UP rule: the sale is more than the time-aware threshold above its
      PREVIOUS sale (catches a garbage spike after a normal sale);
    - DOWN rule: the NEXT sale is less than 1/threshold of this one (catches
      a garbage spike before a normal sale);
    - either way the flagged price itself must be >= JUMP_MIN_PRICE, so
      cheap-property noise and right-to-buy-style resales stay safe.

    Sales are ordered by date with price as the tie-breaker, so that two
    same-day sales of one property always get the same previous/next
    neighbours and the flags are reproducible from run to run.

    Runs as a bounded EAGER pass: .shift().over() window functions may not
    execute under the streaming sink used by fuzzy_join_on_postcode, so the
    flags are computed here and left-joined back into the lazy stream.

    Returns the exclusion rows (group keys, date_of_transfer, price) with a
    literal ``_price_outlier`` column, unique on the four join columns so
    the join-back can never fan out.
    """
    group_keys = ["_pp_group_address", "_pp_group_postcode"]
    price = pl.col("price")
    sold_on = pl.col("date_of_transfer")

    def _max_plausible_ratio(gap: pl.Expr) -> pl.Expr:
        # Years between consecutive sales, floored at six months so
        # back-to-back transfers don't get a near-zero exponent and an
        # over-tight threshold.
        years = (gap.dt.total_days() / 365.25).clip(lower_bound=0.5)
        return JUMP_TOLERANCE * pl.lit(JUMP_GROWTH_PER_YEAR).pow(years)

    up_rule = (price / pl.col("_prev_price")) > _max_plausible_ratio(
        sold_on - pl.col("_prev_date")
    )
    down_rule = (pl.col("_next_price") / price) < 1 / _max_plausible_ratio(
        pl.col("_next_date") - sold_on
    )

    return (
        # "price" is the final sort key: sort() does not promise the relative
        # order of equal rows, so without it same-day sales of one property
        # could swap places between runs and change which sale is flagged.
        slim.sort([*group_keys, "date_of_transfer", "price"])
        .with_columns(
            price.shift(1).over(group_keys).alias("_prev_price"),
            sold_on.shift(1).over(group_keys).alias("_prev_date"),
            price.shift(-1).over(group_keys).alias("_next_price"),
            sold_on.shift(-1).over(group_keys).alias("_next_date"),
        )
        # fill_null(False): a missing neighbour (first/last sale of a group)
        # makes that rule's comparison null, which must read as "not flagged".
        .filter(
            (up_rule.fill_null(False) | down_rule.fill_null(False))
            & (price >= JUMP_MIN_PRICE)
        )
        .select(*group_keys, "date_of_transfer", "price")
        .unique()
        .with_columns(pl.lit(True).alias("_price_outlier"))
    )
def main():
parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
parser.add_argument(
@ -429,15 +503,19 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
# price >= MIN_PRICE and ppd_category == "A" (standard open-market sale) are
# VALUE-QUALITY filters: they gate the price aggregations only. Category B
# entries (repossessions, bulk/portfolio, power-of-sale transfers) and sub-MIN
# sales must not pollute latest_price / historical_prices (and the downstream
# price-per-sqm feature), but they MUST still count for first_transfer_date /
# old_new so a new-build's genuine earliest transfer year is preserved.
# entries (repossessions, bulk/portfolio, power-of-sale transfers), sub-MIN
# sales and jump-flagged outliers must not pollute latest_price /
# historical_prices (and the downstream price-per-sqm feature), but they
# MUST still count for first_transfer_date / old_new so a new-build's
# genuine earliest transfer year is preserved.
price_ok = pl.col("price") >= MIN_PRICE
category_ok = pl.col("ppd_category") == "A"
quality_ok = price_ok & category_ok
value_ok = price_ok & category_ok
# quality_ok additionally excludes consecutive-sale jump outliers (see
# flag_price_outliers); _price_outlier exists only after the join below.
quality_ok = value_ok & pl.col("_price_outlier").is_null()
price_paid = (
price_paid_base = (
pl.scan_parquet(price_paid_path)
.select(
"price",
@ -469,6 +547,52 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
pl.col("_pp_match_postcode").alias("_pp_group_postcode"),
)
.filter(pl.col("pp_address").is_not_null())
# Price-paid carries ~72k duplicate (address, postcode, date, price)
# transaction groups with DISTINCT transaction ids — the same completed
# sale lodged twice — which double-counted sales in historical_prices.
# Collapse each to one row. ppd_category stays in the subset so an
# A/B-categorised pair of the same sale survives as two rows; only the
# A row feeds the price aggregations (quality_ok), which is intentional.
.unique(
subset=[
"_pp_group_address",
"_pp_group_postcode",
"date_of_transfer",
"price",
"ppd_category",
],
keep="any",
)
)
# Bounded eager pass over the quality sales only (~30M rows x 4 narrow
# columns): the window functions inside flag_price_outliers may not run
# under the streaming sink used by fuzzy_join_on_postcode, so the outlier
# flags are computed here and joined back into the lazy stream.
outliers = flag_price_outliers(
price_paid_base.filter(value_ok)
.select(
"_pp_group_address", "_pp_group_postcode", "date_of_transfer", "price"
)
.collect(engine="streaming")
)
print(f"Implausible consecutive-sale price jumps flagged: {outliers.height}")
price_paid = (
# Outlier rows stay in the stream (they still count for
# first_transfer_date / old_new, same as category-B sales); quality_ok
# merely drops them from the price aggregations. _price_outlier is not
# aggregated below, so the helper column dies with the group_by.
price_paid_base.join(
outliers.lazy(),
on=[
"_pp_group_address",
"_pp_group_postcode",
"date_of_transfer",
"price",
],
how="left",
)
.sort("date_of_transfer")
.group_by("_pp_group_address", "_pp_group_postcode", maintain_order=True)
.agg(
@ -511,6 +635,9 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
right_postcode_col="epc_postcode",
)
.drop("epc_postcode")
# Audit trail: keep the fuzzy-match confidence (100 = exact address
# match) in the published output; null means no EPC match.
.rename({"_match_score": "epc_match_score"})
.collect(engine="streaming")
)