from datetime import date

import numpy as np
import polars as pl

from pipeline.transform.price_estimation.estimate import guarded_blend_estimates
from pipeline.transform.price_estimation.knn import build_knn_pool, knn_median_psm
from pipeline.transform.price_estimation.utils import TYPE_GROUPS, type_group_expr


def _flat_index() -> pl.DataFrame:
    """Build a two-row price-index frame for sector ``AA1 1`` in 2026.

    One row is for the "Detached" type group and one for "All"; both carry
    a ``log_index`` of 0.0.
    """
    type_groups = ["Detached", "All"]
    n_rows = len(type_groups)
    return pl.DataFrame(
        {
            "sector": ["AA1 1"] * n_rows,
            "type_group": type_groups,
            "year": [2026] * n_rows,
            "log_index": [0.0] * n_rows,
        }
    )


def test_knn_excludes_same_sale_and_uses_stable_comparables():
    """The target's own sale and its same-identity siblings are not comparables.

    The pool holds the target sale, five same-postcode 900k/20sqm siblings sold
    on the same day, and five 200k/80sqm neighbours in other postcodes; the
    expected median PSM comes from the latter group only.
    """
    sale_date = date(2026, 1, 1)
    epoch = date(1970, 1, 1)

    def make_row(postcode, lat, lon, area, price):
        # Key order matters: it fixes the column order of the resulting frame.
        return {
            "Postcode": postcode,
            "Property type": "Detached",
            "lat": lat,
            "lon": lon,
            "Total floor area (sqm)": area,
            "Last known price": price,
            "Date of last transaction": sale_date,
        }

    # The sale being estimated.
    rows = [make_row("AA1 1AA", 51.5000, -0.1000, 80.0, 900_000.0)]
    # Same postcode, price and date as the target, but a much smaller floor area.
    for i in range(5):
        rows.append(
            make_row("AA1 1AA", 51.5001 + i * 0.00001, -0.1001, 20.0, 900_000.0)
        )
    # Different postcodes: the comparables expected to drive the result.
    for i in range(5):
        rows.append(
            make_row(f"AA1 1B{i}", 51.5010 + i * 0.00001, -0.1010, 80.0, 200_000.0)
        )
    pool_frame = pl.DataFrame(rows)

    trees = build_knn_pool(pool_frame.lazy(), _flat_index(), 2026.0)
    days_since_epoch = (sale_date - epoch).days
    psm = knn_median_psm(
        trees,
        lat=np.array([51.5000]),
        lon=np.array([-0.1000]),
        type_groups=np.array(["Detached"]),
        postcodes=np.array(["AA1 1AA"]),
        last_prices=np.array([900_000.0]),
        last_sale_dates=np.array([days_since_epoch]),
    )

    # All five 900k same-postcode siblings share the target's (postcode, price,
    # date) identity proxy and are therefore excluded as comparables, which
    # leaves the 200k/80sqm = 2_500 PSM neighbours. Dropping same-identity
    # siblings is an INTENTIONAL conservative leakage-prevention tradeoff (there
    # is no per-property id to tell a target's own resale apart from a distinct
    # bulk-block sibling sold same-day at the same price), not ideal behaviour
    # -- see the _sale_identity_matches docstring.
    assert psm[0] == 2_500.0


def test_knn_median_psm_is_deterministic():
    """Reproducibility guard (BUG #6): within-postcode neighbours are co-located
    (one centroid per postcode), so the kNN result for dense postcodes depends on
    an arbitrary same-postcode subset. That is acceptable, but it MUST be stable:
    two identical calls against the same trees/inputs return identical output, so
    future refactors cannot silently introduce run-to-run nondeterminism."""
    sale_date = date(2026, 1, 1)
    # Forty sales in a single postcode with slightly different coordinates and
    # prices, so the pool is dense within one postcode.
    rows = [
        {
            "Postcode": "AA1 1AA",
            "Property type": "Detached",
            "lat": 51.5000 + i * 0.00001,
            "lon": -0.1000,
            "Total floor area (sqm)": 80.0,
            "Last known price": 200_000.0 + i * 1_000.0,
            "Date of last transaction": sale_date,
        }
        for i in range(40)
    ]
    df = pl.DataFrame(rows)
    trees = build_knn_pool(df.lazy(), _flat_index(), 2026.0)

    args = {
        "lat": np.array([51.5000, 51.5002]),
        "lon": np.array([-0.1000, -0.1000]),
        "type_groups": np.array(["Detached", "Detached"]),
    }
    first = knn_median_psm(trees, **args)
    second = knn_median_psm(trees, **args)

    # equal_nan=True: NaN never compares equal to itself, so without it a fully
    # reproducible output that contains NaN entries would fail this guard even
    # though both calls agree position-for-position.
    assert np.array_equal(first, second, equal_nan=True)


def test_guarded_blend_routes_unstable_knn_to_index_and_caps_uplift():
    """Blend two properties: one with a wildly high kNN estimate, one where the
    index and kNN estimates agree at 10x the last price."""
    index_estimates = np.array([120_000.0, 1_000_000.0])
    knn_estimates = np.array([5_000_000.0, 1_000_000.0])
    previous_prices = np.array([100_000.0, 100_000.0])

    blended = guarded_blend_estimates(
        index_est=index_estimates,
        knn_est=knn_estimates,
        last_prices=previous_prices,
    )
    unstable_case, agreeing_case = blended[0], blended[1]

    # Property 0: the kNN estimate is unstable (>2x index), so it is dropped and
    # the index estimate is kept.
    assert unstable_case == 120_000.0
    # Property 1: a 10x uplift over the last price is legitimate appreciation and
    # is no longer truncated (cap raised from 6x to 20x).
    assert agreeing_case == 1_000_000.0


def test_guarded_blend_caps_uplift_at_20x_last_price():
    """An index estimate 50x the last price, with no kNN estimate (NaN), comes
    back as 20x the last price."""
    last_price = 100_000.0
    blended = guarded_blend_estimates(
        index_est=np.array([5_000_000.0]),
        knn_est=np.array([np.nan]),
        last_prices=np.array([last_price]),
    )

    # 50x is above the ceiling, so the result is 100_000 * 20.
    assert blended[0] == 2_000_000.0


def test_bungalow_is_not_a_dead_price_index_type_group():
    """"Bungalow" is not a member of TYPE_GROUPS, and both "Bungalow" and "Other"
    property types map to a null ``type_group``."""
    property_types = pl.DataFrame({"Property type": ["Bungalow", "Other"]})
    mapped = property_types.with_columns(type_group_expr())

    assert "Bungalow" not in TYPE_GROUPS
    assert mapped.get_column("type_group").to_list() == [None, None]


def test_temporal_regularization_damps_curvature_without_breaking_solve():
    """The second-difference prior reduces year-to-year curvature and keeps the
    index well-formed (all years present, finite, contiguous)."""
    from pipeline.transform.price_estimation import index as index_mod

    first_year, last_year = 2010, 2020
    years = np.arange(first_year, last_year + 1)
    # Smooth underlying trend: 0.04 log-points per year since the first year.
    trend = 0.04 * (years - first_year)

    # Adjacent-year pairs following the smooth trend, plus one extra 2014->2015
    # pair carrying a spurious jump (poorly identified curvature spike).
    y1 = np.append(years[:-1], 2014)
    y2 = np.append(years[1:], 2015)
    lr = np.append(np.diff(trend), 0.5).astype(float)
    w = np.ones(len(lr), dtype=float)

    def solve(lmbda):
        # Temporarily override the module-level smoothness weight and always
        # restore it, so other tests see the original value.
        saved = index_mod.TEMPORAL_SMOOTHNESS_LAMBDA
        index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = lmbda
        try:
            result = index_mod.solve_robust_index(y1, y2, lr, w)
        finally:
            index_mod.TEMPORAL_SMOOTHNESS_LAMBDA = saved
        return result

    def max_curvature(index_by_year):
        # Largest absolute second difference of the index in year order.
        ordered = np.array([index_by_year[year] for year in sorted(index_by_year)])
        second_diff = np.diff(ordered, n=2)
        return float(np.max(np.abs(second_diff)))

    unregularised = solve(0.0)
    regularised = solve(0.2)

    # The regularised index covers every year, is finite, and pins the baseline
    # year to 0.
    assert set(regularised.keys()) == set(range(first_year, last_year + 1))
    assert all(np.isfinite(value) for value in regularised.values())
    assert regularised[2010] == 0.0

    # Regularisation strictly reduces curvature, and never flattens the genuine
    # uptrend (the index still rises end to end).
    assert max_curvature(regularised) < max_curvature(unregularised)
    assert regularised[2020] > regularised[2010]