idgf
This commit is contained in:
parent
fbfebc651c
commit
aab85fe32e
33 changed files with 2016 additions and 283 deletions
|
|
@ -19,8 +19,7 @@ from tqdm import tqdm
|
|||
from pipeline.transform.price_estimation.shrinkage import (
|
||||
blend_dicts,
|
||||
hierarchical_shrinkage,
|
||||
reanchor_dict,
|
||||
reanchor_dicts,
|
||||
lift_onto_parent,
|
||||
shrink_dicts,
|
||||
spatial_smooth,
|
||||
)
|
||||
|
|
@ -169,33 +168,47 @@ def solve_robust_index(
|
|||
signs_arr = np.concatenate([np.ones(mask2.sum()), -np.ones(mask1.sum())])
|
||||
|
||||
# Temporal smoothness prior: penalise curvature in the year betas with a
|
||||
# second-difference penalty lambda * (beta_t - 2*beta_{t-1} + beta_{t-2})^2,
|
||||
# encoded as extra least-squares rows (sqrt(lambda) * [1, -2, 1] against a
|
||||
# zero target). This damps single-year index spikes without flattening
|
||||
# genuine multi-year trends. Betas are ordered by calendar year; the baseline
|
||||
# year (min_year, implicit beta=0) has no column, so the penalty spans the
|
||||
# non-baseline years only. For cells with <3 betas there is no curvature to
|
||||
# penalise and the solve is unchanged.
|
||||
# second-difference penalty lambda * (d2 beta / dt2)^2, encoded as extra
|
||||
# least-squares rows (sqrt(lambda) * [w0, w1, w2] against a zero target).
|
||||
# The weights are the CALENDAR-SPACING-AWARE second-derivative coefficients
|
||||
# for the consecutive triple (y0, y1, y2), so gap years are not treated as
|
||||
# adjacent: a multi-year gap relaxes the penalty (correctly preserving a
|
||||
# genuine level jump) instead of forcing a smooth ramp. For unit spacing
|
||||
# (1, 1) these reduce to [1, -2, 1], leaving contiguous cells unchanged.
|
||||
# This damps single-year index spikes without flattening genuine trends.
|
||||
# Betas are ordered by calendar year; the baseline year (min_year, implicit
|
||||
# beta=0) has no column, so the penalty spans the non-baseline years only.
|
||||
# For cells with <3 betas there is no curvature to penalise and the solve is
|
||||
# unchanged.
|
||||
n_pen = 0
|
||||
pen_rows_arr = pen_cols_arr = np.empty(0, dtype=np.int64)
|
||||
pen_vals_arr = pen_b = np.empty(0, dtype=np.float64)
|
||||
if TEMPORAL_SMOOTHNESS_LAMBDA > 0 and n_cols >= 3:
|
||||
sqrt_lambda = float(np.sqrt(TEMPORAL_SMOOTHNESS_LAMBDA))
|
||||
cols_by_year = [c for _, c in sorted(year_to_col.items())]
|
||||
years_sorted = sorted(year_to_col)
|
||||
cols_by_year = [year_to_col[y] for y in years_sorted]
|
||||
n_pen = n_cols - 2
|
||||
pen_rows = np.repeat(n + np.arange(n_pen), 3)
|
||||
pen_cols = np.empty(n_pen * 3, dtype=np.int64)
|
||||
pen_vals = np.empty(n_pen * 3, dtype=np.float64)
|
||||
for k in range(n_pen):
|
||||
pen_cols[3 * k : 3 * k + 3] = (
|
||||
cols_by_year[k],
|
||||
cols_by_year[k + 1],
|
||||
cols_by_year[k + 2],
|
||||
)
|
||||
y0, y1, y2 = years_sorted[k], years_sorted[k + 1], years_sorted[k + 2]
|
||||
w0 = 2.0 / ((y1 - y0) * (y2 - y0))
|
||||
w1 = -2.0 / ((y1 - y0) * (y2 - y1))
|
||||
w2 = 2.0 / ((y2 - y1) * (y2 - y0))
|
||||
pen_vals[3 * k : 3 * k + 3] = (
|
||||
sqrt_lambda * w0,
|
||||
sqrt_lambda * w1,
|
||||
sqrt_lambda * w2,
|
||||
)
|
||||
pen_rows_arr = pen_rows.astype(np.int64)
|
||||
pen_cols_arr = pen_cols
|
||||
pen_vals_arr = np.tile(
|
||||
[sqrt_lambda, -2.0 * sqrt_lambda, sqrt_lambda], n_pen
|
||||
).astype(np.float64)
|
||||
pen_vals_arr = pen_vals
|
||||
pen_b = np.zeros(n_pen, dtype=np.float64)
|
||||
n_total_rows = n + n_pen
|
||||
|
||||
|
|
@ -252,7 +265,11 @@ def compute_indices_for_level(pairs: pl.DataFrame, group_col: str):
|
|||
idx = solve_robust_index(y1, y2, lr, w)
|
||||
if idx:
|
||||
indices[key] = idx
|
||||
n_pairs[key] = len(y1)
|
||||
# Count only information-bearing pairs: same-year (year1==year2) and
|
||||
# baseline-baseline pairs cancel in the sparse solve and contribute
|
||||
# zero information to the annual index, so including them would
|
||||
# inflate the shrinkage weight n/(n+k) and under-shrink noisy sectors.
|
||||
n_pairs[key] = int(np.count_nonzero(y2 != y1))
|
||||
return indices, n_pairs
|
||||
|
||||
|
||||
|
|
@ -433,20 +450,17 @@ def build_index(
|
|||
f" {len(area_idx)} areas, {len(district_idx)} districts, {len(sector_idx)} sectors"
|
||||
)
|
||||
|
||||
# Re-anchor every repeat-sales dict to the global base year before any
|
||||
# shrinkage/smoothing/blending. solve_robust_index anchors each cell to
|
||||
# log-index 0 at its OWN earliest year, so cells with shorter histories
|
||||
# are measured from a later origin; combining them key-by-key would
|
||||
# otherwise average level-incompatible numbers. The hedonic fallback is
|
||||
# already anchored at min_year, so we align everything to min_year.
|
||||
national_idx = reanchor_dict(national_idx, min_year)
|
||||
area_idx = reanchor_dicts(area_idx, min_year)
|
||||
district_idx = reanchor_dicts(district_idx, min_year)
|
||||
sector_idx = reanchor_dicts(sector_idx, min_year)
|
||||
|
||||
# Shrinkage: national -> hedonic first, then hierarchical
|
||||
# Shrinkage: national -> hedonic first, then hierarchical. Each cell is
|
||||
# anchored to log-index 0 at its OWN earliest year (solve_robust_index),
|
||||
# so cells with shorter histories sit on a later origin than their wider
|
||||
# parents. Before each blend we lift the child onto its parent's base at
|
||||
# the child's first year (lift_onto_parent) -- otherwise combining them
|
||||
# key-by-key averages level-incompatible numbers. The hedonic fallback is
|
||||
# anchored at the global min_year, so it serves as the base for national.
|
||||
print(" Applying shrinkage...")
|
||||
national_shrunk = shrink_dicts(national_idx, hedonic_idx, national_n)
|
||||
national_shrunk = shrink_dicts(
|
||||
lift_onto_parent(national_idx, hedonic_idx), hedonic_idx, national_n
|
||||
)
|
||||
sector_shrunk = hierarchical_shrinkage(
|
||||
sector_idx,
|
||||
sector_n,
|
||||
|
|
@ -459,6 +473,7 @@ def build_index(
|
|||
sector_to_dist,
|
||||
dist_to_area,
|
||||
shrink_dicts,
|
||||
lift_onto_parent,
|
||||
)
|
||||
|
||||
# Spatial smoothing
|
||||
|
|
|
|||
|
|
@ -142,6 +142,20 @@ def _sale_identity_matches(
|
|||
target_price: float,
|
||||
target_sale_date: int,
|
||||
) -> np.ndarray:
|
||||
"""Mark pool comparables that are (almost certainly) the target's own sale.
|
||||
|
||||
properties.parquet has no per-property id, so a sale is identified by the
|
||||
proxy tuple (postcode, price within 0.5, sale_date) to keep a target's own
|
||||
prior sale out of its comparable set (leakage prevention).
|
||||
|
||||
Limitation: new-build / bulk blocks sell many DISTINCT properties in one
|
||||
postcode on the same day at the same price, so all such siblings collide on
|
||||
this proxy and are excluded together. This is intentional conservative
|
||||
over-exclusion: it guarantees no leakage at the cost of occasionally
|
||||
dropping legitimate same-(postcode, price, date) siblings. The effect is
|
||||
bounded (~1.8% of the pool) and a precise fix would require a per-property
|
||||
id that the data does not carry.
|
||||
"""
|
||||
if not target_postcode or not np.isfinite(target_price) or target_sale_date < 0:
|
||||
return np.zeros(len(pool_postcodes), dtype=bool)
|
||||
return (
|
||||
|
|
@ -166,6 +180,16 @@ def knn_median_psm(
|
|||
|
||||
PSM is at the reference date used when building the pool.
|
||||
NaN where not computable (missing coords, unknown type, too few neighbors).
|
||||
|
||||
Coordinate limitation: lat/lon come from postcode.parquet (one centroid per
|
||||
postcode), so every property within a postcode is co-located. For a dense
|
||||
postcode the "k nearest" therefore degenerates into an arbitrary
|
||||
same-postcode subset whose membership is decided by KDTree index order
|
||||
rather than true proximity. No property-level coordinates exist to fix this,
|
||||
so the kNN signal is treated as a weak, noisy prior: the downstream guarded
|
||||
blend (guarded_blend_estimates) only blends kNN when it is close to the
|
||||
index estimate and otherwise discards it, bounding the impact of this
|
||||
degeneracy. The result is deterministic for a fixed pool order.
|
||||
"""
|
||||
n = len(lat)
|
||||
result = np.full(n, np.nan)
|
||||
|
|
|
|||
|
|
@ -36,26 +36,43 @@ def _base_value(index: dict[int, float], base_year: int) -> float:
|
|||
return index[prior[-1]]
|
||||
|
||||
|
||||
def reanchor_dict(index: dict[int, float], base_year: int) -> dict[int, float]:
|
||||
"""Re-anchor an index dict so index[base_year] == 0 (pure constant shift).
|
||||
def lift_onto_parent(
|
||||
child: dict[int, float], parent: dict[int, float]
|
||||
) -> dict[int, float]:
|
||||
"""Lift a child index onto its parent's base before blending the two.
|
||||
|
||||
Subtracting the same constant from every year preserves all within-dict
|
||||
year-to-year differences, so estimate.py's (current - sale) semantics are
|
||||
unchanged; it only fixes the cross-dict level mismatch before blending.
|
||||
solve_robust_index anchors every cell to log-index 0 at its OWN earliest
|
||||
year, so a cell with a shorter history sits on a later origin than its
|
||||
(wider) parent. Combining them key-by-key would average level-incompatible
|
||||
numbers (a sector measured from 2008 blended with a district measured from
|
||||
1996). We add the parent's accumulated level at the child's first year, so
|
||||
``child[start] == parent[start]``: the child's own year-to-year moves are
|
||||
layered on top of the parent's growth up to that point -- the same
|
||||
assumption shrinkage already makes for years the child lacks.
|
||||
|
||||
Re-basing on each cell's OWN earliest year (rather than the global base,
|
||||
which the child cannot observe) is what makes this effective: subtracting
|
||||
the child's value at the global base is always 0 and changes nothing.
|
||||
|
||||
The shift is a single constant added to every year of the child, so the
|
||||
child's own year-to-year differences are preserved. PRECONDITION for the
|
||||
downstream estimate to be unaffected within the child's range: the parent's
|
||||
year coverage must be a superset of the child's. This holds throughout
|
||||
build_index, where each parent aggregates a superset of its children's sale
|
||||
pairs, so shrink_dicts blends every child year against a present parent year
|
||||
and the constant shift cancels in a within-range (current - sale) difference;
|
||||
only comparisons that span the child's start year (e.g. a sale predating the
|
||||
cell's own data) change. If a caller violates the precondition (a child year
|
||||
the parent lacks), shrink_dicts passes that year through unshrunk and the
|
||||
cancellation no longer holds.
|
||||
"""
|
||||
if not index:
|
||||
return index
|
||||
shift = _base_value(index, base_year)
|
||||
if shift == 0.0:
|
||||
return index
|
||||
return {y: v - shift for y, v in index.items()}
|
||||
|
||||
|
||||
def reanchor_dicts(
|
||||
indices: dict[str, dict[int, float]], base_year: int
|
||||
) -> dict[str, dict[int, float]]:
|
||||
"""Re-anchor every index dict in a mapping to the common `base_year`."""
|
||||
return {key: reanchor_dict(idx, base_year) for key, idx in indices.items()}
|
||||
if not child or not parent:
|
||||
return child
|
||||
child_start = min(child)
|
||||
offset = _base_value(parent, child_start) - child[child_start]
|
||||
if offset == 0.0:
|
||||
return child
|
||||
return {y: v + offset for y, v in child.items()}
|
||||
|
||||
|
||||
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
|
||||
|
|
@ -84,30 +101,40 @@ def hierarchical_shrinkage(
|
|||
sector_to_dist: dict[str, str],
|
||||
dist_to_area: dict[str, str],
|
||||
shrink_fn: Callable[[V, V, int], V],
|
||||
lift_fn: Callable[[V, V], V] | None = None,
|
||||
) -> dict[str, V]:
|
||||
"""Top-down hierarchical shrinkage: area->top, district->area, sector->district.
|
||||
|
||||
`top_level` is the ultimate fallback value (e.g. national shrunk toward hedonic,
|
||||
or just national). `shrink_fn(raw, parent, n)` blends raw toward parent.
|
||||
`lift_fn(raw, parent)`, if given, re-bases raw onto its parent before blending
|
||||
(see lift_onto_parent); pass None for category-keyed dicts where re-basing is
|
||||
meaningless.
|
||||
"""
|
||||
|
||||
def combine(raw: V, parent: V, n: int) -> V:
|
||||
if lift_fn is not None:
|
||||
raw = lift_fn(raw, parent)
|
||||
return shrink_fn(raw, parent, n)
|
||||
|
||||
# Area -> top level
|
||||
area_shrunk = {}
|
||||
for area, val in area_vals.items():
|
||||
area_shrunk[area] = shrink_fn(val, top_level, area_n[area])
|
||||
area_shrunk[area] = combine(val, top_level, area_n[area])
|
||||
|
||||
# District -> area
|
||||
district_shrunk = {}
|
||||
for dist, val in district_vals.items():
|
||||
a = dist_to_area.get(dist, "")
|
||||
parent = area_shrunk.get(a, top_level)
|
||||
district_shrunk[dist] = shrink_fn(val, parent, district_n[dist])
|
||||
district_shrunk[dist] = combine(val, parent, district_n[dist])
|
||||
|
||||
# Sector -> district
|
||||
sector_shrunk = {}
|
||||
for sec, val in sector_vals.items():
|
||||
d = sector_to_dist.get(sec, "")
|
||||
parent = district_shrunk.get(d, top_level)
|
||||
sector_shrunk[sec] = shrink_fn(val, parent, sector_n[sec])
|
||||
sector_shrunk[sec] = combine(val, parent, sector_n[sec])
|
||||
|
||||
# Fill sectors without their own values
|
||||
for sec in all_sectors:
|
||||
|
|
|
|||
135
pipeline/transform/price_estimation/test_index.py
Normal file
135
pipeline/transform/price_estimation/test_index.py
Normal file
|
|
@ -0,0 +1,135 @@
|
|||
import numpy as np
|
||||
import polars as pl
|
||||
|
||||
from pipeline.transform.price_estimation import index as index_mod
|
||||
from pipeline.transform.price_estimation.index import (
|
||||
compute_indices_for_level,
|
||||
solve_robust_index,
|
||||
)
|
||||
|
||||
|
||||
def _pairs_from_path(true_levels: dict[int, float]):
    """Turn a known log-level path into adjacent-year repeat-sale pairs.

    The years of ``true_levels`` are sorted and every consecutive pair
    (earlier, later) becomes one synthetic repeat sale whose log ratio is the
    difference of the two true log-levels, with unit weight.

    Returns a 4-tuple of arrays ``(year1, year2, log_ratio, weight)``; the
    year arrays are int32 and the other two float64. With fewer than two years
    all four arrays are empty.
    """
    ordered = sorted(true_levels)
    earlier, later = ordered[:-1], ordered[1:]
    log_ratios = [
        true_levels[end] - true_levels[start] for start, end in zip(earlier, later)
    ]
    return (
        np.array(earlier, dtype=np.int32),
        np.array(later, dtype=np.int32),
        np.array(log_ratios, dtype=np.float64),
        # Every synthetic pair carries the same unit weight.
        np.ones(len(earlier), dtype=np.float64),
    )
|
||||
|
||||
|
||||
def test_solver_recovers_contiguous_path():
    """The solver reproduces a contiguous path as log-levels from the first year.

    Guards the contiguous case: with unit year spacing the spacing-aware
    curvature penalty is the standard [1,-2,1], so a zero-curvature ramp must
    come back essentially unchanged.
    """
    base_year = 2010
    span = range(base_year, 2021)
    # Straight-line ramp in log space: zero curvature, so nothing to smooth.
    expected = {year: 0.04 * (year - base_year) for year in span}
    # Triplicate each adjacent pair so MIN_PAIRS is comfortably met.
    y1, y2, lr, w = (np.tile(arr, 3) for arr in _pairs_from_path(expected))

    idx = solve_robust_index(y1, y2, lr, w)

    assert idx[base_year] == 0.0  # baseline anchor
    for year in span:
        assert abs(idx[year] - (expected[year] - expected[base_year])) < 1e-3
|
||||
|
||||
|
||||
def test_gap_spanning_level_jump_is_not_smoothed_into_a_ramp():
    """FIX #5: a genuine level jump across a multi-year gap survives smoothing.

    Observed years are 2000, 2001, 2002 and then 2015, 2016, linked by
    cross-gap pairs that encode a sharp jump. A uniform [1,-2,1] curvature
    penalty would treat (beta_2002, beta_2015, beta_2016) as three adjacent
    years and pull beta_2015 down toward a smooth ramp; the spacing-aware
    second difference is expected to relax the penalty across the gap.
    """
    # True log-levels measured from the 2000 baseline; +1.0 jump over the gap.
    true = {2000: 0.0, 2001: 0.05, 2002: 0.10, 2015: 1.10, 2016: 1.15}

    year_pairs = [
        # In-segment adjacent pairs.
        (2000, 2001),
        (2001, 2002),
        (2015, 2016),
        # Cross-gap pairs consistent with the sharp jump.
        (2002, 2015),
        (2002, 2016),
    ]
    copies = 4  # each pair is observed four times, kept consecutive

    y1 = np.repeat(np.array([a for a, _ in year_pairs], dtype=np.int32), copies)
    y2 = np.repeat(np.array([b for _, b in year_pairs], dtype=np.int32), copies)
    lr = np.repeat(
        np.array([true[b] - true[a] for a, b in year_pairs], dtype=np.float64),
        copies,
    )
    w = np.ones(y1.size, dtype=np.float64)

    # A strong penalty makes any smoothing bias obvious; always restore the
    # module-level setting afterwards so other tests are unaffected.
    saved_lambda = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
    index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = 1.0
    try:
        idx = solve_robust_index(y1, y2, lr, w)
    finally:
        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = saved_lambda

    assert idx[2000] == 0.0  # baseline anchor
    # beta_2015 has to stay near its true post-gap level rather than being
    # dragged down as if the gap were a single-year step.
    assert abs(idx[2015] - true[2015]) < 0.05
|
||||
|
||||
|
||||
def test_n_pairs_counts_only_cross_year_pairs():
    """FIX #12: n_pairs must count only cross-year (year2 != year1) pairs.

    Same-year pairs carry no index information, so counting them would inflate
    the shrinkage weight.
    """
    # (group, year1, year2, copies): 8 genuine cross-year pairs spanning enough
    # years for a valid solve, plus 3 same-year pairs that must not be counted.
    batches = [
        ("g", 2010, 2011, 4),
        ("g", 2011, 2012, 4),
        ("g", 2012, 2012, 3),  # same-year, zero info
    ]
    rows = [
        {
            "grp": group,
            "year1": year1,
            "year2": year2,
            "log_ratio": 0.03 * (year2 - year1),
            "weight": 1.0,
        }
        for group, year1, year2, copies in batches
        for _ in range(copies)
    ]

    indices, n_pairs = compute_indices_for_level(pl.DataFrame(rows), "grp")

    assert "g" in indices
    assert n_pairs["g"] == 8  # not 11
|
||||
|
|
@ -71,9 +71,49 @@ def test_knn_excludes_same_sale_and_uses_stable_comparables():
|
|||
),
|
||||
)
|
||||
|
||||
# The five 900k same-postcode siblings share the target's (postcode, price,
|
||||
# date) identity proxy, so they are all excluded as comparables, leaving the
|
||||
# 200k/80sqm = 2_500 PSM neighbours. Removing same-identity siblings is an
|
||||
# INTENTIONAL conservative leakage-prevention tradeoff (no per-property id
|
||||
# exists to distinguish a target's own resale from a distinct bulk-block
|
||||
# sibling sold same-day at the same price), not ideal behaviour -- see the
|
||||
# _sale_identity_matches docstring.
|
||||
assert psm[0] == 2_500.0
|
||||
|
||||
|
||||
def test_knn_median_psm_is_deterministic():
    """Reproducibility guard (BUG #6) for the kNN price-per-sqm estimate.

    Within-postcode neighbours are co-located (one centroid per postcode), so
    for dense postcodes the kNN result depends on an arbitrary same-postcode
    subset. That is tolerated, but the output MUST be stable: two identical
    calls against the same trees and inputs return identical arrays, so a
    future refactor cannot silently introduce run-to-run nondeterminism.
    """
    shared_sale_date = date(2026, 1, 1)
    # Forty detached sales in one postcode, differing only by a tiny latitude
    # offset and a price step.
    pool = pl.DataFrame(
        [
            {
                "Postcode": "AA1 1AA",
                "Property type": "Detached",
                "lat": 51.5000 + i * 0.00001,
                "lon": -0.1000,
                "Total floor area (sqm)": 80.0,
                "Last known price": 200_000.0 + i * 1_000.0,
                "Date of last transaction": shared_sale_date,
            }
            for i in range(40)
        ]
    )
    trees = build_knn_pool(pool.lazy(), _flat_index(), 2026.0)

    query = {
        "lat": np.array([51.5000, 51.5002]),
        "lon": np.array([-0.1000, -0.1000]),
        "type_groups": np.array(["Detached", "Detached"]),
    }
    # Same trees, same query, evaluated twice in a row.
    first, second = (knn_median_psm(trees, **query) for _ in range(2))

    assert np.array_equal(first, second)
|
||||
|
||||
|
||||
def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
|
||||
blended = guarded_blend_estimates(
|
||||
index_est=np.array([120_000.0, 1_000_000.0]),
|
||||
|
|
|
|||
|
|
@ -1,99 +1,117 @@
|
|||
"""Regression tests for common-base-year re-anchoring before blending.
|
||||
"""Regression tests for parent-base lifting before hierarchical blending.
|
||||
|
||||
Each repeat-sales index dict is anchored to log-index 0 at its OWN earliest
|
||||
year. shrink_dicts / blend_dicts combine dicts key-by-key, so dicts anchored to
|
||||
different base years must be re-anchored to a single common base first, or the
|
||||
solve_robust_index anchors every repeat-sales cell to log-index 0 at its OWN
|
||||
earliest year, so a cell with a shorter history sits on a later origin than its
|
||||
(wider) parent. shrink_dicts / blend_dicts combine dicts key-by-key, so a child
|
||||
must first be lifted onto its parent's base at the child's first year, or the
|
||||
blend averages level-incompatible numbers (fix5-index-base-year).
|
||||
|
||||
Note: re-anchoring each cell to the *global* base year is a no-op on real data
|
||||
(a cell anchored to 0 at its own earliest year already reads 0 there, and the
|
||||
global base is never later), which is why the fix lifts onto the *parent* at the
|
||||
child's own start year instead.
|
||||
"""
|
||||
|
||||
from pipeline.transform.price_estimation.shrinkage import (
|
||||
blend_dicts,
|
||||
reanchor_dict,
|
||||
reanchor_dicts,
|
||||
hierarchical_shrinkage,
|
||||
lift_onto_parent,
|
||||
shrink_dicts,
|
||||
)
|
||||
from pipeline.transform.price_estimation.utils import SHRINKAGE_K
|
||||
|
||||
|
||||
def test_reanchor_is_pure_constant_shift_preserving_differences():
|
||||
"""Re-anchoring only shifts the origin; year-to-year moves are unchanged."""
|
||||
# Anchored at its own earliest year 2008.
|
||||
idx = {2008: 0.0, 2009: 0.10, 2010: 0.25, 2011: 0.40}
|
||||
def test_lift_rebases_late_starting_child_onto_parent():
|
||||
"""A child anchored at its own later start year is lifted to the parent's level there."""
|
||||
parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
|
||||
# Sector with its own repeat-sales data only from 2016, anchored at 2016 = 0.
|
||||
sector = {2016: 0.0, 2024: 0.20}
|
||||
|
||||
reanchored = reanchor_dict(idx, 1996)
|
||||
# 1996 is before this dict's history -> back-fill earliest value (0.0),
|
||||
# so the shift is 0 and the dict is unchanged.
|
||||
assert reanchored[2008] == 0.0
|
||||
lifted = lift_onto_parent(sector, parent)
|
||||
|
||||
# Same shape, different exact-hit base year: anchoring at 2010 subtracts 0.25.
|
||||
reanchored_2010 = reanchor_dict(idx, 2010)
|
||||
assert reanchored_2010[2010] == 0.0
|
||||
# All within-dict differences are preserved under the constant shift.
|
||||
years = sorted(idx)
|
||||
for a, b in zip(years, years[1:]):
|
||||
assert abs((reanchored_2010[b] - reanchored_2010[a]) - (idx[b] - idx[a])) < 1e-12
|
||||
# child[start] now equals the parent's accumulated level at that year.
|
||||
assert abs(lifted[2016] - parent[2016]) < 1e-12 # 1.20
|
||||
assert abs(lifted[2024] - (parent[2016] + 0.20)) < 1e-12 # 1.40
|
||||
# Pure constant shift: the child's own year-to-year move is preserved.
|
||||
assert abs((lifted[2024] - lifted[2016]) - (sector[2024] - sector[2016])) < 1e-12
|
||||
|
||||
|
||||
def test_blend_different_base_years_needs_reanchoring():
|
||||
"""Blending two dicts on different bases is biased unless re-anchored first.
|
||||
def test_lift_is_noop_when_child_starts_at_parent_base():
|
||||
"""A child whose earliest year is the parent's base (value 0) is unchanged."""
|
||||
parent = {1996: 0.0, 2008: 0.80, 2016: 1.20}
|
||||
child = {1996: 0.0, 2008: 0.75, 2016: 1.10}
|
||||
assert lift_onto_parent(child, parent) == child
|
||||
|
||||
Both cells observe the common base year 1996 but were anchored to DIFFERENT
|
||||
origins (sectorA at 1996, sectorB at 2008, as solve_robust_index would do for
|
||||
cells whose pair history starts at different years). They describe the SAME
|
||||
true trajectory measured from 1996, so a 50/50 blend should reproduce that
|
||||
common level. Pre-fix, blend_dicts mixes sectorB's 2008-relative numbers with
|
||||
sectorA's 1996-relative numbers, level-shifting the smoothed result.
|
||||
|
||||
def test_lift_handles_empty_inputs():
|
||||
assert lift_onto_parent({}, {2000: 0.0}) == {}
|
||||
assert lift_onto_parent({2000: 0.0}, {}) == {2000: 0.0}
|
||||
|
||||
|
||||
def test_lift_fixes_estimate_spanning_child_start_but_not_within_range():
|
||||
"""The lift corrects comparisons that span the cell's start year, and ONLY those.
|
||||
|
||||
A property sold in 2008 (before the sector's own data begins in 2016) and
|
||||
valued in 2024: pre-lift the shrunk index mixes a 2016-based sector level
|
||||
with 1996-based parent levels and badly understates the move. Comparisons
|
||||
wholly inside the sector's own range (2016->2024) are unchanged, because the
|
||||
lift is a pure constant shift that cancels in a within-cell difference.
|
||||
"""
|
||||
base_year = 1996
|
||||
parent = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
|
||||
sector = {2016: 0.0, 2024: 0.20} # own data starts 2016
|
||||
n = 30
|
||||
w = n / (n + SHRINKAGE_K)
|
||||
|
||||
# True log-levels relative to 1996 (identical trajectory for both cells).
|
||||
truth = {1996: 0.0, 2008: 0.80, 2012: 1.00}
|
||||
raw = shrink_dicts(sector, parent, n) # pre-fix: blend without lifting
|
||||
fixed = shrink_dicts(lift_onto_parent(sector, parent), parent, n)
|
||||
|
||||
# sectorA: anchored at 1996 (its earliest year) -> equals truth.
|
||||
sector_a = dict(truth)
|
||||
# sectorB: same trajectory but anchored at 2008 (subtract truth[2008] from
|
||||
# every year), exactly how solve_robust_index would express a cell whose
|
||||
# earliest year happened to be picked as 2008.
|
||||
shift_b = truth[2008]
|
||||
sector_b = {y: v - shift_b for y, v in truth.items()}
|
||||
# Within the sector's own range the lift changes nothing.
|
||||
assert abs((fixed[2024] - fixed[2016]) - (raw[2024] - raw[2016])) < 1e-12
|
||||
|
||||
# --- Pre-fix behaviour: blend the raw dicts directly. ---
|
||||
raw_blend = blend_dicts(sector_a, [sector_b], 0.5, [0.5])
|
||||
# Every year is pulled by half of shift_b (0.4) away from the truth.
|
||||
assert abs(raw_blend[2012] - truth[2012]) > 0.3
|
||||
assert abs(raw_blend[1996] - truth[1996]) > 0.3
|
||||
# 2008 is parent-only in both (sector absent), so both read parent[2008].
|
||||
assert abs(raw[2008] - parent[2008]) < 1e-12
|
||||
assert abs(fixed[2008] - parent[2008]) < 1e-12
|
||||
|
||||
# --- Post-fix behaviour: re-anchor to the common base, THEN blend. ---
|
||||
reanchored = reanchor_dicts({"A": sector_a, "B": sector_b}, base_year)
|
||||
fixed_blend = blend_dicts(reanchored["A"], [reanchored["B"]], 0.5, [0.5])
|
||||
# Both cells now read 0 at 1996 and the true level at every shared year.
|
||||
for y in truth:
|
||||
assert abs(fixed_blend[y] - truth[y]) < 1e-9
|
||||
raw_move = raw[2024] - raw[2008]
|
||||
fixed_move = fixed[2024] - fixed[2008]
|
||||
# Hand-computed: raw[2024] = w*0.20 + (1-w)*1.50; fixed[2024] = w*1.40 + (1-w)*1.50.
|
||||
assert abs(raw_move - ((w * 0.20 + (1 - w) * 1.50) - 0.80)) < 1e-12
|
||||
assert abs(fixed_move - ((w * 1.40 + (1 - w) * 1.50) - 0.80)) < 1e-12
|
||||
# The fix raises the spanning move by exactly the parent growth to the
|
||||
# sector's start year that the raw blend dropped (weighted by w).
|
||||
assert abs((fixed_move - raw_move) - w * parent[2016]) < 1e-12
|
||||
# Fixed move is close to the true area-level move (0.70); raw badly understates it.
|
||||
assert abs(fixed_move - 0.70) < 0.2
|
||||
assert raw_move < 0.4 * fixed_move
|
||||
|
||||
|
||||
def test_shrink_dicts_after_reanchoring_is_consistent():
|
||||
"""Shrinking a cell toward its parent must use a common origin."""
|
||||
base_year = 2000
|
||||
# Parent (national) anchored at 2000.
|
||||
parent = {2000: 0.0, 2010: 0.50, 2020: 1.20}
|
||||
# Sector tracking the parent exactly but anchored at 2010 (subtract 0.50 from
|
||||
# every year), as solve_robust_index would express a cell whose earliest year
|
||||
# is later. It still observes the 2000 base year (value -0.50).
|
||||
sector = {2000: -0.50, 2010: 0.0, 2020: 0.70}
|
||||
n = 0 # no own data weight -> result should equal parent after anchoring
|
||||
def test_hierarchical_shrinkage_lift_fn_only_changes_spanning_comparisons():
|
||||
"""Integration: passing lift_fn re-bases a late-starting sector via its parent chain."""
|
||||
top = {1996: 0.0, 2008: 0.80, 2016: 1.20, 2024: 1.50}
|
||||
sector = {"AB1 1": {2016: 0.0, 2024: 0.20}}
|
||||
sector_n = {"AB1 1": 300}
|
||||
# No own area/district indices -> the sector shrinks straight toward `top`.
|
||||
base_args = (
|
||||
sector,
|
||||
sector_n,
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
{},
|
||||
top,
|
||||
["AB1 1"],
|
||||
{"AB1 1": "AB1"},
|
||||
{"AB1": "AB"},
|
||||
shrink_dicts,
|
||||
)
|
||||
|
||||
reanchored_sector = reanchor_dict(sector, base_year)
|
||||
# Exact hit on 2000 subtracts -0.50, putting the sector back on the parent's
|
||||
# origin: 0.0 at 2000, 0.50 at 2010, 1.20 at 2020.
|
||||
shrunk = shrink_dicts(reanchored_sector, parent, n)
|
||||
assert abs(shrunk[2000] - 0.0) < 1e-9
|
||||
assert abs(shrunk[2010] - 0.50) < 1e-9
|
||||
assert abs(shrunk[2020] - 1.20) < 1e-9
|
||||
without_lift = hierarchical_shrinkage(*base_args)["AB1 1"]
|
||||
with_lift = hierarchical_shrinkage(*base_args, lift_onto_parent)["AB1 1"]
|
||||
|
||||
|
||||
def test_reanchor_exact_hit_shifts_all_years():
|
||||
"""When the base year is present, subtract its value from every year."""
|
||||
idx = {1996: 0.0, 2005: 0.30, 2015: 0.90}
|
||||
reanchored = reanchor_dict(idx, 2005)
|
||||
assert reanchored[2005] == 0.0
|
||||
assert abs(reanchored[1996] - (-0.30)) < 1e-12
|
||||
assert abs(reanchored[2015] - 0.60) < 1e-12
|
||||
# Within the sector's own range: identical (pure constant shift cancels).
|
||||
assert abs(
|
||||
(with_lift[2024] - with_lift[2016]) - (without_lift[2024] - without_lift[2016])
|
||||
) < 1e-12
|
||||
# Spanning the sector's start year: the lift raises the 2008->2024 move.
|
||||
assert (with_lift[2024] - with_lift[2008]) > (
|
||||
without_lift[2024] - without_lift[2008]
|
||||
) + 0.1
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue