Fable findings in data
This commit is contained in:
parent
b98bc6d611
commit
6a33b03fdf
20 changed files with 1502 additions and 274 deletions
|
|
@ -31,6 +31,22 @@ RATING_RANK = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
|
|||
# conservative tradeoff to keep clearly-implausible transfers out.
|
||||
MIN_PRICE = 10_000
|
||||
|
||||
# Time-aware consecutive-sale jump guard. Price-paid contains keyed-in price
|
||||
# errors that pass the MIN_PRICE/category filters — e.g. 13 QUICKSETTS HR2 7PP,
|
||||
# a 93 m² terrace, sold £140,000 in 2016 then "£207,500,000" in 2026 (clearly
|
||||
# £207,500 with extra digits, lodged as category A) — and would otherwise
|
||||
# become latest_price. A quality sale is flagged when it exceeds its
|
||||
# neighbouring sale by more than JUMP_TOLERANCE * JUMP_GROWTH_PER_YEAR ** years
|
||||
# between the two sales. Calibration: genuine extreme appreciation (prime
|
||||
# London 1995->2026 is roughly x50 over 31 years) stays comfortably under
|
||||
# 12 * 1.10**31 ≈ 230, while the HR2 case (x1,482 over 10 years against a
|
||||
# threshold of 12 * 1.10**10 ≈ 31) is caught. JUMP_MIN_PRICE is an absolute
|
||||
# floor on the flagged price itself so right-to-buy resales and other
|
||||
# legitimate x20-50 jumps on cheap properties are never flagged.
|
||||
JUMP_TOLERANCE = 12.0
|
||||
JUMP_GROWTH_PER_YEAR = 1.10
|
||||
JUMP_MIN_PRICE = 2_000_000
|
||||
|
||||
# Plausible construction-year range; band-derived years outside it (e.g. OCR
|
||||
# noise like 1012 or 2202) are nulled rather than published.
|
||||
MIN_BUILD_YEAR = 1700
|
||||
|
|
@ -286,6 +302,64 @@ def _scan_epc_certificates(epc_path: Path, temp_dir: Path) -> pl.LazyFrame:
|
|||
return _select_epc_columns(raw)
|
||||
|
||||
|
||||
def flag_price_outliers(slim: pl.DataFrame) -> pl.DataFrame:
    """Flag the implausible side of extreme consecutive-sale price jumps.

    ``slim`` holds one row per quality (>= MIN_PRICE, category A) sale:
    (_pp_group_address, _pp_group_postcode, date_of_transfer, price). Per
    property, each sale is compared against its previous and next sale and
    the HIGHER sale of an implausible pair is flagged:

    - UP rule: the sale is more than the time-aware threshold above its
      PREVIOUS sale (catches a garbage spike after a normal sale);
    - DOWN rule: the NEXT sale is less than 1/threshold of this one (catches
      a garbage spike before a normal sale);
    - either way the flagged price itself must be >= JUMP_MIN_PRICE, so
      cheap-property noise and right-to-buy-style resales stay safe.

    The time-aware threshold is
    ``JUMP_TOLERANCE * JUMP_GROWTH_PER_YEAR ** years``, where ``years`` is the
    gap between the two compared sales, floored at 0.5.

    Sales are ordered by date and then by price within each property, so
    several sales of one property on the same day always receive the same
    previous/next neighbours from run to run.

    Runs as a bounded EAGER pass: .shift().over() window functions may not
    execute under the streaming sink used by fuzzy_join_on_postcode, so the
    flags are computed here and left-joined back into the lazy stream.

    Returns the exclusion rows (group keys, date_of_transfer, price) with a
    literal ``_price_outlier`` column, unique on the four join columns so
    the join-back can never fan out.
    """
    group_keys = ["_pp_group_address", "_pp_group_postcode"]

    # Years between consecutive sales, floored at six months so back-to-back
    # transfers don't get a near-zero exponent and an over-tight threshold.
    dy_prev = (
        (pl.col("date_of_transfer") - pl.col("_prev_date")).dt.total_days() / 365.25
    ).clip(lower_bound=0.5)
    dy_next = (
        (pl.col("_next_date") - pl.col("date_of_transfer")).dt.total_days() / 365.25
    ).clip(lower_bound=0.5)

    # UP: this sale towers over the previous one.
    up_rule = (pl.col("price") / pl.col("_prev_price")) > JUMP_TOLERANCE * pl.lit(
        JUMP_GROWTH_PER_YEAR
    ).pow(dy_prev)
    # DOWN: the next sale collapses relative to this one, i.e. this sale is
    # the spike of the pair.
    down_rule = (pl.col("_next_price") / pl.col("price")) < 1 / (
        JUMP_TOLERANCE * pl.lit(JUMP_GROWTH_PER_YEAR).pow(dy_next)
    )

    return (
        # "price" is the last sort key purely as a tie-break: without it the
        # relative order of same-day sales of one property is unspecified
        # (sort does not promise to keep equal elements in input order), and
        # the shift()-derived neighbours - hence the flags - could differ
        # between runs.
        slim.sort([*group_keys, "date_of_transfer", "price"])
        .with_columns(
            pl.col("price").shift(1).over(group_keys).alias("_prev_price"),
            pl.col("date_of_transfer").shift(1).over(group_keys).alias("_prev_date"),
            pl.col("price").shift(-1).over(group_keys).alias("_next_price"),
            pl.col("date_of_transfer").shift(-1).over(group_keys).alias("_next_date"),
        )
        # fill_null(False): a missing neighbour (first/last sale of a group)
        # makes that rule's comparison null, which must read as "not flagged".
        .filter(
            (up_rule.fill_null(False) | down_rule.fill_null(False))
            & (pl.col("price") >= JUMP_MIN_PRICE)
        )
        # Keep only the join-back columns; the helper _prev_*/_next_* columns
        # are dropped here.
        .select(*group_keys, "date_of_transfer", "price")
        .unique()
        .with_columns(pl.lit(True).alias("_price_outlier"))
    )
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Fuzzy join EPC and Price Paid data")
|
||||
parser.add_argument(
|
||||
|
|
@ -429,15 +503,19 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
|
||||
# price >= MIN_PRICE and ppd_category == "A" (standard open-market sale) are
|
||||
# VALUE-QUALITY filters: they gate the price aggregations only. Category B
|
||||
# entries (repossessions, bulk/portfolio, power-of-sale transfers) and sub-MIN
|
||||
# sales must not pollute latest_price / historical_prices (and the downstream
|
||||
# price-per-sqm feature), but they MUST still count for first_transfer_date /
|
||||
# old_new so a new-build's genuine earliest transfer year is preserved.
|
||||
# entries (repossessions, bulk/portfolio, power-of-sale transfers), sub-MIN
|
||||
# sales and jump-flagged outliers must not pollute latest_price /
|
||||
# historical_prices (and the downstream price-per-sqm feature), but they
|
||||
# MUST still count for first_transfer_date / old_new so a new-build's
|
||||
# genuine earliest transfer year is preserved.
|
||||
price_ok = pl.col("price") >= MIN_PRICE
|
||||
category_ok = pl.col("ppd_category") == "A"
|
||||
quality_ok = price_ok & category_ok
|
||||
value_ok = price_ok & category_ok
|
||||
# quality_ok additionally excludes consecutive-sale jump outliers (see
|
||||
# flag_price_outliers); _price_outlier exists only after the join below.
|
||||
quality_ok = value_ok & pl.col("_price_outlier").is_null()
|
||||
|
||||
price_paid = (
|
||||
price_paid_base = (
|
||||
pl.scan_parquet(price_paid_path)
|
||||
.select(
|
||||
"price",
|
||||
|
|
@ -469,6 +547,52 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
pl.col("_pp_match_postcode").alias("_pp_group_postcode"),
|
||||
)
|
||||
.filter(pl.col("pp_address").is_not_null())
|
||||
# Price-paid carries ~72k duplicate (address, postcode, date, price)
|
||||
# transaction groups with DISTINCT transaction ids — the same completed
|
||||
# sale lodged twice — which double-counted sales in historical_prices.
|
||||
# Collapse each to one row. ppd_category stays in the subset so an
|
||||
# A/B-categorised pair of the same sale survives as two rows; only the
|
||||
# A row feeds the price aggregations (quality_ok), which is intentional.
|
||||
.unique(
|
||||
subset=[
|
||||
"_pp_group_address",
|
||||
"_pp_group_postcode",
|
||||
"date_of_transfer",
|
||||
"price",
|
||||
"ppd_category",
|
||||
],
|
||||
keep="any",
|
||||
)
|
||||
)
|
||||
|
||||
# Bounded eager pass over the quality sales only (~30M rows x 4 narrow
|
||||
# columns): the window functions inside flag_price_outliers may not run
|
||||
# under the streaming sink used by fuzzy_join_on_postcode, so the outlier
|
||||
# flags are computed here and joined back into the lazy stream.
|
||||
outliers = flag_price_outliers(
|
||||
price_paid_base.filter(value_ok)
|
||||
.select(
|
||||
"_pp_group_address", "_pp_group_postcode", "date_of_transfer", "price"
|
||||
)
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
print(f"Implausible consecutive-sale price jumps flagged: {outliers.height}")
|
||||
|
||||
price_paid = (
|
||||
# Outlier rows stay in the stream (they still count for
|
||||
# first_transfer_date / old_new, same as category-B sales); quality_ok
|
||||
# merely drops them from the price aggregations. _price_outlier is not
|
||||
# aggregated below, so the helper column dies with the group_by.
|
||||
price_paid_base.join(
|
||||
outliers.lazy(),
|
||||
on=[
|
||||
"_pp_group_address",
|
||||
"_pp_group_postcode",
|
||||
"date_of_transfer",
|
||||
"price",
|
||||
],
|
||||
how="left",
|
||||
)
|
||||
.sort("date_of_transfer")
|
||||
.group_by("_pp_group_address", "_pp_group_postcode", maintain_order=True)
|
||||
.agg(
|
||||
|
|
@ -511,6 +635,9 @@ def _run(epc_path: Path, price_paid_path: Path, output_path: Path, temp_dir: Pat
|
|||
right_postcode_col="epc_postcode",
|
||||
)
|
||||
.drop("epc_postcode")
|
||||
# Audit trail: keep the fuzzy-match confidence (100 = exact address
|
||||
# match) in the published output; null means no EPC match.
|
||||
.rename({"_match_score": "epc_match_score"})
|
||||
.collect(engine="streaming")
|
||||
)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue