"""Shared utilities for price estimation modules."""

from datetime import date

import numpy as np
import polars as pl

# Year the price index is extended to (see LATEST_COMPLETE_YEAR below).
# Hard-coded, unlike CURRENT_FRAC_YEAR which is read from the system clock.
# NOTE(review): the two can disagree once the clock rolls into a new year —
# confirm this constant is bumped deliberately.
CURRENT_YEAR = 2026
# Latest COMPLETE calendar year. The current year's transactions are only
# partially reported (Land Registry lags ~2-3 months), so a sector's thin
# partial-year repeat-sale set produces wild index betas (e.g. +334% in a
# single sector). The index is SOLVED only on complete years (<= this) and
# forward-filled/trend-extrapolated to CURRENT_YEAR, so current-value
# projections follow the established trend instead of a partial-year spike.
LATEST_COMPLETE_YEAR = CURRENT_YEAR - 1
# Today's date as a fractional year at month resolution: January maps to
# year + 0, December to year + 11/12. Evaluated once, at import time.
_today = date.today()
CURRENT_FRAC_YEAR = _today.year + (_today.month - 1) / 12

# Cap on log(index_ratio) to prevent wild estimates from thin sectors
MAX_LOG_ADJUSTMENT = 3.0  # ~20x max price change (e^3 ~= 20.1)
# "Property type" values that type_group_expr() collapses into "Terraced".
TERRACE_TYPES = [
    "Mid-Terrace",
    "End-Terrace",
    "Enclosed Mid-Terrace",
    "Enclosed End-Terrace",
    "Terraced",
]
# "Property type" values that type_group_expr() maps to "Flats".
FLAT_TYPES = ["Flats/Maisonettes"]
# The four non-null type_group values that type_group_expr() can produce.
TYPE_GROUPS = ["Detached", "Semi-Detached", "Terraced", "Flats"]
# NOTE(review): presumably the k of an n/(n+k) shrinkage weight; it is not
# used in this part of the file — confirm at the call sites.
SHRINKAGE_K = 50

# Temporal regularization for the repeat-sales index: a second-difference
# (curvature) penalty lambda * sum((beta_t - 2*beta_{t-1} + beta_{t-2})^2) added
# to the IRLS solve. A mild penalty damps single-year index spikes (which would
# otherwise distort the estimate of any property whose last sale landed on a
# noisy year) without flattening genuine multi-year trends.
# This is the base weight, written lambda0 in the support-scaling formula below.
TEMPORAL_SMOOTHNESS_LAMBDA = 0.05

# Per-year support scaling for the temporal smoothness penalty. A flat lambda
# is too weak for years with very few repeat-sale pairs: a sector can have
# hundreds of pairs overall (so cell-level n/(n+k) shrinkage barely moves it)
# yet have individual years estimated from 1-2 pairs, producing 2-7x
# single-year index spikes. Each curvature row is therefore scaled by the
# local pair support of its year triple:
#   lambda_eff = lambda0 * (1 + SMOOTHNESS_SUPPORT_PAIRS / s)
# where s is the minimum cross-year pair count among the triple's years.
# Well-supported years (s >> SMOOTHNESS_SUPPORT_PAIRS) keep lambda_eff ~
# lambda0 (i.e. the flat-lambda behaviour); a year identified by a single
# pair (s = 1) gets (1 + 40/1) = 41x lambda0, pulling its beta strongly toward
# the local trend through its neighbours. Same-year pairs cancel in the
# design and are not counted.
SMOOTHNESS_SUPPORT_PAIRS = 40


def type_group_expr():
    """Build the Polars expression deriving ``type_group`` from ``Property type``.

    Values listed in TERRACE_TYPES become "Terraced", values listed in
    FLAT_TYPES become "Flats", and "Detached" / "Semi-Detached" are passed
    through unchanged. Anything else falls through to null.
    """
    property_type = pl.col("Property type")

    # Branch predicates, evaluated in this priority order.
    is_terrace = property_type.is_in(TERRACE_TYPES)
    is_flat = property_type.is_in(FLAT_TYPES)
    keeps_own_name = property_type.is_in(["Detached", "Semi-Detached"])

    grouped = (
        pl.when(is_terrace)
        .then(pl.lit("Terraced"))
        .when(is_flat)
        .then(pl.lit("Flats"))
        .when(keeps_own_name)
        .then(property_type)
        .otherwise(pl.lit(None))
    )
    return grouped.alias("type_group")


def sector_expr():
    """Build the Polars expression deriving ``sector`` from ``Postcode``.

    The sector is the postcode with its final two characters removed and any
    surrounding whitespace stripped from what remains.
    """
    postcode = pl.col("Postcode")
    # Number of leading characters to keep: everything but the last two.
    keep_chars = postcode.str.len_chars() - 2
    trimmed = postcode.str.slice(0, keep_chars).str.strip_chars()
    return trimmed.alias("sector")


def hierarchy_keys(sector: str) -> tuple[str, str]:
    """Split a sector string into its ``(district, area)`` parents.

    The district is everything before the last space in ``sector`` (or the
    whole string when it contains no space). The area is the leading run of
    alphabetic characters of the district.
    """
    head, space, _ = sector.rpartition(" ")
    district = head if space else sector

    # Index of the first non-letter; the whole district if it is all letters.
    cut = next(
        (pos for pos, ch in enumerate(district) if not ch.isalpha()),
        len(district),
    )
    return district, district[:cut]


# Type groups that each get a 0/1 dummy column in build_hedonic_features(),
# in column order. "Detached" is absent because it is the reference level.
NON_REF_TYPES = ["Terraced", "Semi-Detached", "Flats"]


def build_hedonic_features(df: pl.DataFrame) -> np.ndarray:
    """Build the hedonic feature matrix: log floor area plus type dummies.

    Column 0 is ``log(max(floor_area, 1))`` taken from
    "Total floor area (sqm)"; floor areas below 1 are clamped to 1 before the
    log. It is followed by one 0/1 dummy per entry of NON_REF_TYPES, in that
    order. "Detached" is the reference level, so it (like any other value not
    listed in NON_REF_TYPES) gets all-zero dummies.

    Args:
        df: Frame with "Total floor area (sqm)" and "type_group" columns.

    Returns:
        float32 array of shape ``(len(df), 1 + len(NON_REF_TYPES))``.
    """
    floor_area = df["Total floor area (sqm)"].to_numpy().astype(np.float32)
    type_group = df["type_group"].to_numpy()

    columns = [np.log(np.maximum(floor_area, 1.0))]
    columns.extend(
        (type_group == name).astype(np.float32) for name in NON_REF_TYPES
    )
    # column_stack turns each 1-D array into one column of the 2-D result.
    return np.column_stack(columns)


def interpolate_log_index(
    index: pl.DataFrame,
    df: pl.DataFrame,
    sector_col: str,
    type_col: str,
    frac_year_col: str,
    output_alias: str,
) -> pl.DataFrame:
    """Attach a log_index value linearly interpolated at a fractional year.

    The index is looked up (via join_type_stratified_index) at the two whole
    years bracketing ``frac_year_col`` and blended by the fractional part.
    Example: frac_year 2019.75 gives 0.25*idx_2019 + 0.75*idx_2020.
    If only one of the two bracketing values is available it is used as-is;
    if neither is, the result is null.

    Args:
        index: Index table handed to join_type_stratified_index.
        df: Frame to enrich.
        sector_col: Name of the sector column in ``df``.
        type_col: Name of the type-group column in ``df``.
        frac_year_col: Name of the fractional-year column in ``df``.
        output_alias: Name of the column to add.

    Returns:
        ``df`` with ``output_alias`` appended; all scratch columns are removed.
    """
    # Scratch column names, namespaced by the output alias.
    prefix = f"_{output_alias}"
    lo_val, hi_val = f"{prefix}_floor", f"{prefix}_ceil"
    lo_year, hi_year = f"{prefix}_floor_year", f"{prefix}_ceil_year"
    weight = f"{prefix}_frac"

    frac_year = pl.col(frac_year_col)
    whole_year = frac_year.floor()
    df = df.with_columns(
        whole_year.cast(pl.Int32).alias(lo_year),
        frac_year.ceil().cast(pl.Int32).alias(hi_year),
        (frac_year - whole_year).alias(weight),
    )

    # Look the index up at the lower bracket year, then at the upper one.
    for year_name, value_name in ((lo_year, lo_val), (hi_year, hi_val)):
        df = join_type_stratified_index(
            df, index, sector_col, type_col, year_name, value_name
        )

    lo, hi, w = pl.col(lo_val), pl.col(hi_val), pl.col(weight)
    has_lo = lo.is_not_null()
    has_hi = hi.is_not_null()

    # (1 - w) * lower + w * upper, falling back to whichever side exists.
    blended = (
        pl.when(has_lo & has_hi)
        .then((1.0 - w) * lo + w * hi)
        .when(has_lo)
        .then(lo)
        .when(has_hi)
        .then(hi)
        .otherwise(pl.lit(None))
    )

    scratch = (lo_val, hi_val, lo_year, hi_year, weight)
    return df.with_columns(blended.alias(output_alias)).drop(*scratch)


def extract_centroids(input_path) -> dict[str, tuple[float, float]]:
    """Compute the mean lat/lon per postcode sector.

    Args:
        input_path: Parquet source passed to ``pl.scan_parquet``; it must
            provide "Postcode", "lat" and "lon" columns.

    Returns:
        Mapping of sector -> (mean lat, mean lon), built from rows where the
        postcode and both coordinates are present.
    """
    print("Computing sector centroids...")
    df = (
        pl.scan_parquet(input_path)
        .select("Postcode", "lat", "lon")
        # Require BOTH coordinates. Filtering on lat alone would let the two
        # means be taken over different row sets, and a sector with no lon
        # values at all would come back as (lat, None).
        .filter(
            pl.col("Postcode").is_not_null(),
            pl.col("lat").is_not_null(),
            pl.col("lon").is_not_null(),
        )
        .with_columns(sector_expr())
        .group_by("sector")
        .agg(pl.col("lat").mean(), pl.col("lon").mean())
        .collect()
    )
    centroids = {
        sector: (lat, lon)
        for sector, lat, lon in df.select("sector", "lat", "lon").iter_rows()
    }
    print(f"  {len(centroids):,} sector centroids")
    return centroids


def join_type_stratified_index(
    df: pl.DataFrame,
    index: pl.DataFrame,
    sector_col: str,
    type_col: str,
    year_col: str,
    output_alias: str,
) -> pl.DataFrame:
    """Look up log_index per row, preferring the typed index over the "All" one.

    ``index`` is split into rows whose type_group is "All" and rows with any
    other type_group. Each row of ``df`` is left-joined to the typed rows on
    (sector, type_group, year) and to the "All" rows on (sector, year); the
    typed value is used when present, otherwise the "All" value.

    Args:
        df: Frame to enrich.
        index: Table with "sector", "type_group", "year", "log_index" columns.
        sector_col: Name of the sector column in ``df``.
        type_col: Name of the type-group column in ``df``.
        year_col: Name of the year column in ``df``.
        output_alias: Name of the column to add.

    Returns:
        ``df`` with ``output_alias`` appended (null where neither index matched).
    """
    typed_name = f"_{output_alias}_typed"
    all_name = f"_{output_alias}_all"

    typed_lookup = index.filter(pl.col("type_group") != "All").select(
        "sector", "type_group", "year", pl.col("log_index").alias(typed_name)
    )
    pooled_lookup = index.filter(pl.col("type_group") == "All").select(
        "sector", "year", pl.col("log_index").alias(all_name)
    )

    with_typed = df.join(
        typed_lookup,
        left_on=[sector_col, type_col, year_col],
        right_on=["sector", "type_group", "year"],
        how="left",
    )
    with_both = with_typed.join(
        pooled_lookup,
        left_on=[sector_col, year_col],
        right_on=["sector", "year"],
        how="left",
    )

    # Typed value wins; the pooled "All" value only fills the gaps.
    resolved = pl.col(typed_name).fill_null(pl.col(all_name)).alias(output_alias)
    return with_both.with_columns(resolved).drop(typed_name, all_name)