# perfect-postcode/pipeline/transform/test_crime.py

import polars as pl

from pipeline.transform.crime import find_street_crime_csvs, transform_crime


def test_find_street_crime_csvs_ignores_archive_sidecars(tmp_path):
    """Discovery returns only the street CSV and reports the other three CSVs as ignored."""
    root = tmp_path / "crime"
    monthly = root / "2024-01"
    monthly.mkdir(parents=True)

    expected = monthly / "2024-01-test-force-street.csv"
    expected.touch()

    # Two non-street files in the month folder plus a stray CSV at the top level.
    for sidecar in (
        monthly / "2024-01-test-force-outcomes.csv",
        monthly / "2024-01-test-force-stop-and-search.csv",
        root / "notes.csv",
    ):
        sidecar.touch()

    found, skipped = find_street_crime_csvs(root)

    assert found == [expected]
    assert skipped == 3


def test_transform_crime_reads_only_street_crime_csvs(tmp_path):
    """An outcomes CSV sitting next to the street CSV must not affect the output."""
    source_dir = tmp_path / "crime"
    folder = source_dir / "2024-01"
    folder.mkdir(parents=True)

    street_lines = [
        "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context",
        "1,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
        "2,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
        "3,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,,No LSOA,Robbery,Under investigation,",
    ]
    street_csv = folder / "2024-01-test-force-street.csv"
    # One newline after every line, including the last.
    street_csv.write_text("".join(f"{line}\n" for line in street_lines))

    outcomes_csv = folder / "2024-01-test-force-outcomes.csv"
    outcomes_csv.write_text(
        "Crime ID,Month,Reported by,Outcome type\n1,2024-01,Test Force,Charged\n"
    )

    parquet_path = tmp_path / "crime.parquet"
    transform_crime(source_dir, parquet_path)

    rows = pl.read_parquet(parquet_path).to_dicts()

    # Expected: the two burglaries in the single month give 24/yr, and the
    # robbery row with a blank LSOA code contributes neither a row nor a column.
    expected_rows = [{"LSOA code": "E01000001", "Burglary (avg/yr)": 24.0}]
    assert rows == expected_rows


def test_transform_crime_annualises_over_all_valid_months(tmp_path):
    """With two months of data, two incidents in an LSOA annualise to 12/yr.

    Crime types an LSOA never recorded are expected to appear as 0.0.
    """
    root = tmp_path / "crime"
    columns = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
    rows_by_month = {
        "2024-01": [
            "1,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
            "2,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
            "3,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
        ],
        "2024-02": [
            "4,2024-02,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
        ],
    }

    # Create every month folder first, then drop one street CSV into each.
    for month in rows_by_month:
        (root / month).mkdir(parents=True)
    for month, rows in rows_by_month.items():
        csv_path = root / month / f"{month}-test-force-street.csv"
        csv_path.write_text("\n".join([columns, *rows]) + "\n")

    parquet_path = tmp_path / "crime.parquet"
    transform_crime(root, parquet_path)

    frame = pl.read_parquet(parquet_path).sort("LSOA code")
    actual = frame.to_dicts()

    expected = [
        {
            "LSOA code": "E01000001",
            "Burglary (avg/yr)": 12.0,
            "Robbery (avg/yr)": 0.0,
        },
        {
            "LSOA code": "E01000002",
            "Burglary (avg/yr)": 0.0,
            "Robbery (avg/yr)": 12.0,
        },
    ]
    assert actual == expected


def test_transform_crime_writes_by_year_output(tmp_path):
    """The optional by-year parquet holds per-year annualised counts per crime type.

    Also checks that the aggregate "Serious crime" and "Minor crime" columns
    are present, and pins the "Serious crime" values for both years.
    """
    root = tmp_path / "crime"
    columns = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
    rows_by_month = {
        "2023-01": [
            "1,2023-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
            "2,2023-01,F,F,-0.1,51.5,X,E01000001,L,Robbery,U,",
        ],
        "2024-01": [
            "3,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
            "4,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
        ],
        "2024-02": [
            "5,2024-02,F,F,-0.1,51.5,X,E01000001,L,Anti-social behaviour,U,",
        ],
    }

    for month in rows_by_month:
        (root / month).mkdir(parents=True)
    for month, rows in rows_by_month.items():
        csv_path = root / month / f"{month}-test-force-street.csv"
        csv_path.write_text("\n".join([columns, *rows]) + "\n")

    avg_path = tmp_path / "crime.parquet"
    yearly_path = tmp_path / "crime_by_year.parquet"
    transform_crime(root, avg_path, yearly_path)

    yearly = pl.read_parquet(yearly_path)
    assert yearly.height == 1

    present = set(yearly.columns)
    for wanted in (
        "Burglary (by year)",
        "Serious crime (by year)",
        "Minor crime (by year)",
    ):
        assert wanted in present

    record = yearly.row(0, named=True)

    # 2023 has 1 burglary over 1 month (12/yr); 2024 has 2 over 2 months (12/yr).
    burglary_points = sorted(
        record["Burglary (by year)"], key=lambda point: point["year"]
    )
    assert burglary_points == [
        {"year": 2023, "count": 12.0},
        {"year": 2024, "count": 12.0},
    ]

    # Serious crime for 2023 is Burglary (12) plus Robbery (12).
    serious_by_year = {
        point["year"]: point["count"] for point in record["Serious crime (by year)"]
    }
    assert serious_by_year[2023] == 24.0
    assert serious_by_year[2024] == 12.0


def test_transform_crime_headline_is_mean_of_per_year_bars(tmp_path):
    """Pin the avg/yr headline to the plain mean of the per-year annualised counts.

    A month-weighted pooled rate gives a different number whenever years have
    uneven partial-month coverage, as in this fixture, and must NOT be used:
    the headline has to match the average of the by-year chart bars.
    """
    root = tmp_path / "crime"
    columns = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"

    def burglary_row(crime_id, month):
        return f"{crime_id},{month},F,F,-0.1,51.5,X,E01000001,L,Burglary,U,"

    rows_by_month = {
        # 2023: 6 burglaries in 1 month -> 6 * 12 / 1 = 72/yr.
        "2023-01": [burglary_row(n, "2023-01") for n in range(1, 7)],
        # 2024: 2 burglaries across 2 months -> 2 * 12 / 2 = 12/yr.
        "2024-01": [burglary_row(7, "2024-01")],
        "2024-02": [burglary_row(8, "2024-02")],
    }

    for month in rows_by_month:
        (root / month).mkdir(parents=True)
    for month, rows in rows_by_month.items():
        csv_path = root / month / f"{month}-test-force-street.csv"
        csv_path.write_text("\n".join([columns, *rows]) + "\n")

    avg_path = tmp_path / "crime.parquet"
    yearly_path = tmp_path / "crime_by_year.parquet"
    transform_crime(root, avg_path, yearly_path)

    # Mean of per-year bars = (72 + 12) / 2 = 42.0.
    # The old pooled rate (8 incidents / 3 months * 12 = 32.0) would be wrong.
    headline_rows = pl.read_parquet(avg_path).to_dicts()
    headline = headline_rows[0]
    assert headline["Burglary (avg/yr)"] == 42.0

    yearly_row = pl.read_parquet(yearly_path).row(0, named=True)
    bars = {
        point["year"]: point["count"] for point in yearly_row["Burglary (by year)"]
    }
    assert bars == {2023: 72.0, 2024: 12.0}

    # The headline must equal the mean of the bars it summarises.
    mean_of_bars = sum(bars.values()) / len(bars)
    assert headline["Burglary (avg/yr)"] == mean_of_bars


def test_transform_crime_fails_without_valid_months(tmp_path):
    """A CSV whose only row has a blank Month must make the transform raise ValueError."""
    root = tmp_path / "crime"
    folder = root / "2024-01"
    folder.mkdir(parents=True)

    lines = [
        "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context",
        # The Month field (second column) is deliberately empty.
        "1,,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
    ]
    csv_path = folder / "2024-01-test-force-street.csv"
    csv_path.write_text("".join(f"{line}\n" for line in lines))

    parquet_path = tmp_path / "crime.parquet"

    caught = None
    try:
        transform_crime(root, parquet_path)
    except ValueError as exc:
        caught = exc

    if caught is None:
        raise AssertionError("Expected ValueError")
    assert "No valid crime months" in str(caught)


def test_transform_crime_applies_lsoa_2011_to_2021_lookup(tmp_path):
    """With a lookup parquet supplied, output rows are keyed by the 2021 LSOA codes.

    Incidents from a 2011 LSOA that maps to two 2021 LSOAs are expected to be
    shared evenly between the two children.
    """
    root = tmp_path / "crime"
    folder = root / "2024-01"
    folder.mkdir(parents=True)

    columns = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
    rows = [
        "1,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
        "2,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
        "3,2024-01,F,F,-0.1,51.5,X,E01000099,L,Burglary,U,",
    ]
    csv_path = folder / "2024-01-test-force-street.csv"
    csv_path.write_text("\n".join([columns, *rows]) + "\n")

    # E01000001 was split into two 2021 LSOAs; E01000099 is unchanged.
    recodes = [
        ("E01000001", "E01000050"),
        ("E01000001", "E01000051"),
        ("E01000099", "E01000099"),
    ]
    lookup_path = tmp_path / "lookup.parquet"
    lookup = pl.DataFrame(
        {
            "lsoa11": [old for old, _ in recodes],
            "lsoa21": [new for _, new in recodes],
        }
    )
    lookup.write_parquet(lookup_path)

    avg_path = tmp_path / "crime.parquet"
    yearly_path = tmp_path / "by_year.parquet"
    transform_crime(root, avg_path, yearly_path, lookup_path)

    target_codes = ("E01000050", "E01000051", "E01000099")

    # The split LSOA's 2 burglaries are shared evenly: one incident per child,
    # which annualises to 12/yr each over the single month of data.
    headline = pl.read_parquet(avg_path).sort("LSOA code").to_dicts()
    assert headline == [
        {"LSOA code": code, "Burglary (avg/yr)": 12.0} for code in target_codes
    ]

    yearly_rows = pl.read_parquet(yearly_path).sort("LSOA code").to_dicts()
    series_by_code = {
        entry["LSOA code"]: entry["Burglary (by year)"] for entry in yearly_rows
    }
    for code in target_codes:
        assert series_by_code[code] == [{"year": 2024, "count": 12.0}]


def test_transform_crime_sums_mixed_weights_within_a_target_lsoa(tmp_path):
    """Per-incident weights must be summed when they differ inside one group.

    Irregular (M:N) recodes can put rows with DIFFERENT `_weight`s into the
    same (lsoa21, year, type) group. Here E01000050 gets two 0.5-weighted
    incidents from the split E01000001 and one 1.0-weighted incident from
    E01000099. The earlier `_weight.first() * len` aggregation applied a
    single row's weight to all three, giving nondeterministically 1.5 or 3.0
    instead of the correct 2.0.
    """
    root = tmp_path / "crime"
    folder = root / "2024-01"
    folder.mkdir(parents=True)

    columns = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
    rows = [
        "1,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
        "2,2024-01,F,F,-0.1,51.5,X,E01000001,L,Burglary,U,",
        "3,2024-01,F,F,-0.1,51.5,X,E01000099,L,Burglary,U,",
    ]
    csv_path = folder / "2024-01-test-force-street.csv"
    csv_path.write_text("\n".join([columns, *rows]) + "\n")

    # E01000001 splits across two targets; E01000099 merges into one of them.
    recodes = [
        ("E01000001", "E01000050"),
        ("E01000001", "E01000051"),
        ("E01000099", "E01000050"),
    ]
    lookup_path = tmp_path / "lookup.parquet"
    lookup = pl.DataFrame(
        {
            "lsoa11": [old for old, _ in recodes],
            "lsoa21": [new for _, new in recodes],
        }
    )
    lookup.write_parquet(lookup_path)

    avg_path = tmp_path / "crime.parquet"
    yearly_path = tmp_path / "by_year.parquet"
    transform_crime(root, avg_path, yearly_path, lookup_path)

    headline = pl.read_parquet(avg_path).sort("LSOA code").to_dicts()

    expected = [
        # 0.5 + 0.5 + 1.0 = 2.0 incidents -> 24/yr annualised.
        {"LSOA code": "E01000050", "Burglary (avg/yr)": 24.0},
        # 0.5 + 0.5 = 1.0 incident -> 12/yr.
        {"LSOA code": "E01000051", "Burglary (avg/yr)": 12.0},
    ]
    assert headline == expected


def test_transform_crime_maps_legacy_crime_types(tmp_path):
    """Pre-2014 police.uk crime type names are folded into their current
    equivalents rather than being dropped or kept as separate columns."""
    root = tmp_path / "crime"
    folder = root / "2013-01"
    folder.mkdir(parents=True)

    columns = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"
    rows = [
        "1,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Violent crime,Under investigation,",
        "2,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Public disorder and weapons,Under investigation,",
        "3,2013-01,Test Force,Test Force,-0.1,51.5,On or near X,E01000001,L,Burglary,Under investigation,",
    ]
    csv_path = folder / "2013-01-test-force-street.csv"
    csv_path.write_text("\n".join([columns, *rows]) + "\n")

    avg_path = tmp_path / "crime.parquet"
    yearly_path = tmp_path / "crime_by_year.parquet"
    transform_crime(root, avg_path, yearly_path)

    headline = pl.read_parquet(avg_path).to_dicts()[0]

    # One month of data, so each single incident annualises to 12/yr under
    # the current column name.
    for current_name in (
        "Violence and sexual offences (avg/yr)",
        "Public order (avg/yr)",
        "Burglary (avg/yr)",
    ):
        assert headline[current_name] == 12.0

    # The legacy names must NOT survive as their own columns.
    for legacy_name in (
        "Violent crime (avg/yr)",
        "Public disorder and weapons (avg/yr)",
    ):
        assert legacy_name not in headline

    yearly_row = pl.read_parquet(yearly_path).row(0, named=True)

    # Serious = Violence and sexual offences (12) + Burglary (12) = 24.
    serious_by_year = {
        point["year"]: point["count"]
        for point in yearly_row["Serious crime (by year)"]
    }
    assert serious_by_year[2013] == 24.0

    # Minor = Public order (12).
    minor_by_year = {
        point["year"]: point["count"] for point in yearly_row["Minor crime (by year)"]
    }
    assert minor_by_year[2013] == 12.0