Fable findings in data

2026-06-11 07:49:23 +01:00 · 2026-06-11 07:49:23 +01:00 · 6a33b03fdf
commit 6a33b03fdf
parent b98bc6d611
20 changed files with 1502 additions and 274 deletions
--- a/pipeline/transform/join_epc_pp.py
+++ b/pipeline/transform/join_epc_pp.py
@@ -31,6 +31,22 @@ RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
 # conservative tradeoff to keep clearly-implausible transfers out.
 MIN_PRICE = 10_000

+# Time-aware consecutive-sale jump guard. Price-paid contains keyed-in price
+# errors that pass the MIN_PRICE/category filters — e.g. 13 QUICKSETTS HR2 7PP,
+# a 93 m² terrace, sold £140,000 in 2016 then "£207,500,000" in 2026 (clearly
+# £207,500 with extra digits, lodged as category A) — and would otherwise
+# become latest_price. A quality sale is flagged when it exceeds its
+# neighbouring sale by more than JUMP_TOLERANCE * JUMP_GROWTH_PER_YEAR ** years
+# between the two sales. Calibration: genuine extreme appreciation (prime
+# London 1995->2026 is roughly x50 over 31 years) stays comfortably under
+# 12 * 1.10**31 ≈ 230, while the HR2 case (x1,482 over 10 years against a
+# threshold of 12 * 1.10**10 ≈ 31) is caught. JUMP_MIN_PRICE is an absolute
+# floor on the flagged price itself so right-to-buy resales and other
+# legitimate x20-50 jumps on cheap properties are never flagged.
+JUMP_TOLERANCE = 12.0
+JUMP_GROWTH_PER_YEAR = 1.10
+JUMP_MIN_PRICE = 2_000_000
+
 # Plausible construction-year range; band-derived years outside it (e.g. OCR
 # noise like 1012 or 2202) are nulled rather than published.
 MIN_BUILD_YEAR = 1700
@@ -286,6 +302,64 @@ def _scan_epc_certificates(epc_path: Path, temp_dir: Path) -> pl.LazyFrame:
    return _select_epc_columns(raw)


+def flag_price_outliers(slim: pl.DataFrame) -> pl.DataFrame:
+    """Flag the implausible side of extreme consecutive-sale price jumps.
+
+    ``slim`` holds one row per quality (>= MIN_PRICE, category A) sale:
+    (_pp_group_address, _pp_group_postcode, date_of_transfer, price). Per
+    property, each sale is compared against its previous and next sale and
+    the HIGHER sale of an implausible pair is flagged:
+
+    - UP rule: the sale is more than the time-aware threshold above its
+      PREVIOUS sale (catches a garbage spike after a normal sale);
+    - DOWN rule: the NEXT sale is less than 1/threshold of this one (catches
+      a garbage spike before a normal sale);
+    - either way the flagged price itself must be >= JUMP_MIN_PRICE, so
+      cheap-property noise and right-to-buy-style resales stay safe.
+
+    Runs as a bounded EAGER pass: .shift().over() window functions may not
+    execute under the streaming sink used by fuzzy_join_on_postcode, so the
+    flags are computed here and left-joined back into the lazy stream.
+
+    Returns the exclusion rows (group keys, date_of_transfer, price) with a
+    literal ``_price_outlier`` column, unique on the four join columns so
+    the join-back can never fan out.
+    """
+    group_keys = ["_pp_group_address", "_pp_group_postcode"]
+    # Years between consecutive sales, floored at six months so back-to-back
+    # transfers don't get a near-zero exponent and an over-tight threshold.
+    dy_prev = (
+        (pl.col("date_of_transfer") - pl.col("_prev_date")).dt.total_days() / 365.25
+    ).clip(lower_bound=0.5)
+    dy_next = (
+        (pl.col("_next_date") - pl.col("date_of_transfer")).dt.total_days() / 365.25
+    ).clip(lower_bound=0.5)
+    up_rule = (pl.col("price") / pl.col("_prev_price")) > JUMP_TOLERANCE * pl.lit(
+        JUMP_GROWTH_PER_YEAR
+    ).pow(dy_prev)
+    down_rule = (pl.col("_next_price") / pl.col("price")) < 1 / (
+        JUMP_TOLERANCE * pl.lit(JUMP_GROWTH_PER_YEAR).pow(dy_next)
+    )
+    return (
+        slim.sort([*group_keys, "date_of_transfer"])
+        .with_columns(
+            pl.col("price").shift(1).over(group_keys).alias("_prev_price"),
+            pl.col("date_of_transfer").shift(1).over(group_keys).alias("_prev_date"),
+            pl.col("price").shift(-1).over(group_keys).alias("_next_price"),
+            pl.col("date_of_transfer").shift(-1).over(group_keys).alias("_next_date"),
+        )
+        # fill_null(False): a missing neighbour (first/last sale of a group)
+        # makes that rule's comparison null, which must read as "not flagged".
+        .filter(
+            (up_rule.fill_null(False) | down_rule.fill_null(False))
+            & (pl.col("price") >= JUMP_MIN_PRICE)
+        )
+        .select(*group_keys, "date_of_transfer", "price")
+        .unique()
+        .with_columns(pl.lit(True).alias("_price_outlier"))
+    )
+
+
 def main():
    parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
    parser.add_argument(
@@ -429,15 +503,19 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat

    # price >= MIN_PRICE and ppd_category == "A" (standard open-market sale) are
    # VALUE-QUALITY filters: they gate the price aggregations only. Category B
-    # entries (repossessions, bulk/portfolio, power-of-sale transfers) and sub-MIN
-    # sales must not pollute latest_price / historical_prices (and the downstream
-    # price-per-sqm feature), but they MUST still count for first_transfer_date /
-    # old_new so a new-build's genuine earliest transfer year is preserved.
+    # entries (repossessions, bulk/portfolio, power-of-sale transfers), sub-MIN
+    # sales and jump-flagged outliers must not pollute latest_price /
+    # historical_prices (and the downstream price-per-sqm feature), but they
+    # MUST still count for first_transfer_date / old_new so a new-build's
+    # genuine earliest transfer year is preserved.
    price_ok = pl.col("price") >= MIN_PRICE
    category_ok = pl.col("ppd_category") == "A"
-    quality_ok = price_ok & category_ok
+    value_ok = price_ok & category_ok
+    # quality_ok additionally excludes consecutive-sale jump outliers (see
+    # flag_price_outliers); _price_outlier exists only after the join below.
+    quality_ok = value_ok & pl.col("_price_outlier").is_null()

-    price_paid = (
+    price_paid_base = (
        pl.scan_parquet(price_paid_path)
        .select(
            "price",
@@ -469,6 +547,52 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
            pl.col("_pp_match_postcode").alias("_pp_group_postcode"),
        )
        .filter(pl.col("pp_address").is_not_null())
+        # Price-paid carries ~72k duplicate (address, postcode, date, price)
+        # transaction groups with DISTINCT transaction ids — the same completed
+        # sale lodged twice — which double-counted sales in historical_prices.
+        # Collapse each to one row. ppd_category stays in the subset so an
+        # A/B-categorised pair of the same sale survives as two rows; only the
+        # A row feeds the price aggregations (quality_ok), which is intentional.
+        .unique(
+            subset=[
+                "_pp_group_address",
+                "_pp_group_postcode",
+                "date_of_transfer",
+                "price",
+                "ppd_category",
+            ],
+            keep="any",
+        )
+    )
+
+    # Bounded eager pass over the quality sales only (~30M rows x 4 narrow
+    # columns): the window functions inside flag_price_outliers may not run
+    # under the streaming sink used by fuzzy_join_on_postcode, so the outlier
+    # flags are computed here and joined back into the lazy stream.
+    outliers = flag_price_outliers(
+        price_paid_base.filter(value_ok)
+        .select(
+            "_pp_group_address", "_pp_group_postcode", "date_of_transfer", "price"
+        )
+        .collect(engine="streaming")
+    )
+    print(f"Implausible consecutive-sale price jumps flagged: {outliers.height}")
+
+    price_paid = (
+        # Outlier rows stay in the stream (they still count for
+        # first_transfer_date / old_new, same as category-B sales); quality_ok
+        # merely drops them from the price aggregations. _price_outlier is not
+        # aggregated below, so the helper column dies with the group_by.
+        price_paid_base.join(
+            outliers.lazy(),
+            on=[
+                "_pp_group_address",
+                "_pp_group_postcode",
+                "date_of_transfer",
+                "price",
+            ],
+            how="left",
+        )
        .sort("date_of_transfer")
        .group_by("_pp_group_address", "_pp_group_postcode", maintain_order=True)
        .agg(
@@ -511,6 +635,9 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
            right_postcode_col="epc_postcode",
        )
        .drop("epc_postcode")
+        # Audit trail: keep the fuzzy-match confidence (100 = exact address
+        # match) in the published output; null means no EPC match.
+        .rename({"_match_score": "epc_match_score"})
        .collect(engine="streaming")
    )