Fable findings in data

This commit is contained in:
Andras Schmelczer 2026-06-11 07:49:23 +01:00
parent b98bc6d611
commit 6a33b03fdf
20 changed files with 1502 additions and 274 deletions

View file

@ -12,6 +12,18 @@ V = TypeVar("V")
# NOTE(review): the call sites of the two SPATIAL_* constants are outside this
# view. Presumably SPATIAL_NEIGHBORS is how many neighbouring areas feed the
# spatial smoothing step and SPATIAL_BLEND_K is the support constant in its
# blend weight -- confirm against the code that reads them.
SPATIAL_NEIGHBORS = 5
SPATIAL_BLEND_K = 30
# Hard band on a sector's per-year index move RELATIVE to its parent (the
# national index), enforced by winsorize_steps after spatial smoothing. The
# support-scaled temporal smoothness prior still under-penalises years
# identified by only 1-2 repeat-sale pairs in thin early histories, leaving
# artefacts like a x9.7 single-year jump (log +2.27, sector "M3 1"
# 1998->1999). A sector may genuinely outpace the nation -- regeneration, new
# transport links -- but those stories play out over multiple years, not as a
# one-year x9.7 step. +/-0.40 log/yr (~x1.5 in a year) relative to the
# national move keeps every plausible genuine sector-level divergence while
# clamping thin-year data artefacts.
# Units: natural-log index points per year (exp(0.40) ~= 1.49).
MAX_STEP_DEVIATION_PER_YEAR = 0.40
def _base_value(index: dict[int, float], base_year: int) -> float:
"""Value of an index dict at `base_year`, with forward/back-fill for gaps.
@ -75,6 +87,42 @@ def lift_onto_parent(
return {y: v + offset for y, v in child.items()}
def winsorize_steps(
    child: dict[int, float],
    parent: dict[int, float],
    max_dev_per_year: float,
) -> dict[int, float]:
    """Keep a child's yearly index moves within a band around the parent's.

    The child's years are visited in ascending order. For each adjacent pair
    (y_prev, y) the child's annualised step
    (child[y] - child[y_prev]) / (y - y_prev) is compared with the parent's
    annualised step over the same span and pinned to lie no more than
    `max_dev_per_year` below or above it. Parent levels are looked up through
    _base_value, which forward/back-fills years the parent does not cover, so
    a gap in the parent is not an error here.

    The output is re-accumulated from the child's earliest level, hence:
    - the earliest year keeps its original value;
    - a step already inside the band keeps its size, while every later year
      shifts by whatever the pinned steps removed;
    - a gap of several years is assessed on its annualised rate rather than
      as a single large move, so a real level change across a gap survives.

    Args:
        child: index level keyed by year for the series being constrained.
        parent: index level keyed by year for the reference series.
        max_dev_per_year: half-width of the allowed band, per year, in the
            same units as the index levels.

    Returns:
        A new dict with the same years as `child` and the constrained levels.
        When `child` has fewer than two years, or `parent` is empty, there is
        nothing to compare and the very same `child` object is returned.
    """
    if len(child) < 2 or not parent:
        return child

    ordered_years = sorted(child)
    first_year = ordered_years[0]

    # Running level of the rebuilt series; starts at the child's own anchor.
    level = child[first_year]
    rebuilt = {first_year: level}

    for earlier, later in zip(ordered_years, ordered_years[1:]):
        gap = later - earlier
        child_rate = (child[later] - child[earlier]) / gap
        parent_rate = (
            _base_value(parent, later) - _base_value(parent, earlier)
        ) / gap

        floor = parent_rate - max_dev_per_year
        ceiling = parent_rate + max_dev_per_year
        bounded_rate = min(max(child_rate, floor), ceiling)

        # Accumulate on the already-rebuilt level so earlier clamps carry
        # forward into every subsequent year.
        level = level + bounded_rate * gap
        rebuilt[later] = level

    return rebuilt
def shrink_dicts(raw: dict, parent: dict, n: int) -> dict:
"""Shrink dict values toward parent using n/(n+k) weighting.