import logging
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional

import polars as pl

from constants import MAX_BEDROOMS, MAX_RENT_MONTHLY, MIN_RENT_MONTHLY
from transform import map_property_type, normalize_price

# Module-level logger used by this file; registered under the fixed name
# "rightmove" rather than the module's own __name__.
log = logging.getLogger("rightmove")


# Offsets, in days from today, for the relative date words some listings use
# instead of a real date (OpenRent: "Today").
_RELATIVE_DAY_OFFSETS = {"today": 0, "tomorrow": 1}

# Textual layouts tried when a value is not ISO-8601 (OpenRent: "DD Month, YYYY").
# NOTE: %B matches month names in the current locale.
_TEXT_DATE_FORMATS = ("%d %B, %Y", "%d %B %Y")


def _to_utc_naive(dt: datetime) -> datetime:
    """Return *dt* as a naive datetime in UTC; naive input is returned as-is."""
    if dt.tzinfo is None:
        return dt
    return dt.astimezone(timezone.utc).replace(tzinfo=None)


def _parse_listing_date(raw: object, today: datetime) -> Optional[datetime]:
    """Convert one ``first_visible_date`` value to a naive UTC datetime.

    Args:
        raw: The scraped value. Expected to be a string; ``datetime``
            instances are accepted and normalised, anything else is rejected.
        today: Naive UTC midnight of the current day, used to resolve the
            relative words "today" and "tomorrow".

    Returns:
        The parsed datetime, or ``None`` when the value is empty or cannot be
        interpreted.
    """
    if isinstance(raw, datetime):
        return _to_utc_naive(raw)
    if not isinstance(raw, str):
        # Calling str methods on e.g. an int would raise AttributeError.
        return None

    text = raw.strip()
    if not text:
        return None

    try:
        # fromisoformat() on older Pythons rejects the "Z" suffix, so spell
        # it out as an explicit UTC offset first.
        return _to_utc_naive(datetime.fromisoformat(text.replace("Z", "+00:00")))
    except ValueError:
        pass

    offset = _RELATIVE_DAY_OFFSETS.get(text.lower())
    if offset is not None:
        return today + timedelta(days=offset)

    for fmt in _TEXT_DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return None


def _sanitize_room_counts(properties: list[dict]) -> int:
    """Null implausible bedroom/bathroom counts in place.

    Values above MAX_BEDROOMS are almost certainly prices or other numeric
    fields mis-parsed as room counts. The derived
    "Number of bedrooms & living rooms" field is recomputed for every
    property afterwards so it never reflects a discarded value.

    Returns:
        The number of properties that had at least one count nulled.
    """
    flagged = 0
    for p in properties:
        implausible = [
            key
            for key in ("Bedrooms", "Bathrooms")
            if (p.get(key) or 0) > MAX_BEDROOMS
        ]
        for key in implausible:
            p[key] = None
        if implausible:
            flagged += 1

        beds = p.get("Bedrooms")
        baths = p.get("Bathrooms")
        # NOTE(review): the sum is bedrooms + bathrooms although the column
        # name mentions living rooms — confirm this is the intended formula.
        p["Number of bedrooms & living rooms"] = (
            None if beds is None or baths is None else beds + baths
        )
    return flagged


def _remap_property_types(properties: list[dict]) -> int:
    """Re-derive "Property type" from "Property sub-type" in place.

    This retroactively fixes data scraped with older versions of the type
    map. Empty and "Unknown" sub-types are left untouched.

    Returns:
        The number of properties whose type changed.
    """
    changed = 0
    for p in properties:
        sub_type = p.get("Property sub-type", "")
        if not sub_type or sub_type == "Unknown":
            continue
        current_type = map_property_type(sub_type)
        if current_type != p.get("Property type"):
            p["Property type"] = current_type
            changed += 1
    return changed


def _asking_prices(properties: list[dict]) -> list:
    """Return sale prices, with missing or non-positive prices as ``None``.

    Zero prices indicate parsing failures or POA/auction listings.
    """
    prices = []
    for p in properties:
        price = p["price"]
        prices.append(price if price is not None and price > 0 else None)
    return prices


def _asking_rents(properties: list[dict]) -> tuple[list, int]:
    """Return monthly rents and the count of values that were nulled.

    Rents outside [MIN_RENT_MONTHLY, MAX_RENT_MONTHLY] are almost always
    total-stay pricing (short lets), annual rents mislabelled as monthly, or
    £0 placeholders — they are nulled rather than polluting aggregates.
    A missing price is nulled and counted the same way.
    """
    rents = []
    nulled = 0
    for p in properties:
        price = p["price"]
        monthly = (
            None if price is None else normalize_price(price, p["price_frequency"])
        )
        # The chained comparison is False for NaN, so NaN is nulled too
        # instead of slipping through into an integer column.
        if monthly is None or not (MIN_RENT_MONTHLY <= monthly <= MAX_RENT_MONTHLY):
            nulled += 1
            rents.append(None)
        else:
            rents.append(monthly)
    return rents, nulled


def write_parquet(properties: list[dict], path: Path, channel: str) -> None:
    """Write properties list to parquet with server-ready column names.

    Before writing, the property dicts are cleaned up **in place**:
    implausible bedroom/bathroom counts are nulled, the combined room count
    is recomputed, and "Property type" is re-derived from the sub-type.

    Args:
        properties: Scraped property dicts. Every pass-through output column
            (see the schema below) plus ``price`` must be present as a key;
            rent listings also need ``price_frequency``.
        path: Destination parquet file.
        channel: "buy" or "rent". "buy" fills "Asking price" and adds
            "Asking price per sqm"; any other value is treated as rent and
            fills "Asking rent (monthly)".

    Raises:
        KeyError: If a property lacks one of the required keys.

    Nothing is written when ``properties`` is empty; a warning is logged
    instead.
    """
    if not properties:
        log.warning("No properties to write to %s", path)
        return

    flagged = _sanitize_room_counts(properties)
    if flagged:
        log.warning(
            "Sanitized %d properties with bedroom/bathroom counts > %d (set to null)",
            flagged,
            MAX_BEDROOMS,
        )

    remapped = _remap_property_types(properties)
    if remapped:
        log.info("Re-mapped %d property types from sub-types", remapped)

    # Listing dates are stored as naive UTC, so "today" is resolved against
    # the UTC clock too, and only once so every row in this call agrees.
    today = datetime.now(timezone.utc).replace(
        tzinfo=None, hour=0, minute=0, second=0, microsecond=0
    )
    listing_dates = []
    unparsed = 0
    for p in properties:
        raw = p.get("first_visible_date", "")
        parsed = _parse_listing_date(raw, today) if raw else None
        if raw and parsed is None:
            unparsed += 1
        listing_dates.append(parsed)
    if unparsed:
        log.warning(
            "Could not parse first_visible_date for %d properties (set to null)",
            unparsed,
        )

    # Derive asking price / asking rent based on channel
    total = len(properties)
    if channel == "buy":
        asking_prices = _asking_prices(properties)
        asking_rents = [None] * total
        listing_status = "For sale"
    else:
        asking_prices = [None] * total
        asking_rents, rent_outliers = _asking_rents(properties)
        if rent_outliers:
            log.warning(
                "Nulled %d rent outliers outside [£%d, £%d]/month",
                rent_outliers,
                MIN_RENT_MONTHLY,
                MAX_RENT_MONTHLY,
            )
        listing_status = "For rent"

    # Output columns in file order. Built here rather than at import time so
    # the polars types are only touched when a frame is actually written.
    schema = {
        "Bedrooms": pl.Int32,
        "Bathrooms": pl.Int32,
        "Number of bedrooms & living rooms": pl.Int32,
        "lon": pl.Float64,
        "lat": pl.Float64,
        "Postcode": pl.Utf8,
        "Address per Property Register": pl.Utf8,
        "Leasehold/Freehold": pl.Utf8,
        "Property type": pl.Utf8,
        "Property sub-type": pl.Utf8,
        "Price qualifier": pl.Utf8,
        "Total floor area (sqm)": pl.Float64,
        "Listing URL": pl.Utf8,
        "Listing features": pl.List(pl.Utf8),
        "Listing date": pl.Datetime("us"),
        "Listing status": pl.Utf8,
        "Asking price": pl.Int64,
        "Asking rent (monthly)": pl.Int64,
    }
    derived = {
        "Listing date": listing_dates,
        "Listing status": [listing_status] * total,
        "Asking price": asking_prices,
        "Asking rent (monthly)": asking_rents,
    }
    # Every column that is not derived above is copied straight from the
    # property dicts; iterating the schema keeps data and schema in step.
    data = {
        name: derived[name] if name in derived else [p[name] for p in properties]
        for name in schema
    }
    df = pl.DataFrame(data, schema=schema)

    # Derive asking price per sqm for buy listings. The non-strict cast is
    # expected to turn results that do not fit Int32 (e.g. from a zero floor
    # area) into null instead of raising.
    if channel == "buy":
        df = df.with_columns(
            (pl.col("Asking price") / pl.col("Total floor area (sqm)"))
            .round(0)
            .cast(pl.Int32, strict=False)
            .alias("Asking price per sqm"),
        )

    df.write_parquet(path)
    log.info("Wrote %d properties to %s", len(df), path)