Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@@ -279,3 +279,37 @@ def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
err = capsys.readouterr().err
assert "Cyber fraud" in err
assert "WARNING" in err
def test_legacy_crime_types_are_mapped(tmp_path):
    """Legacy (pre-2014) crime-type labels are translated, not discarded.

    One month of 2013 data carrying two old category names is run through
    the spatial transform; both must surface under their present-day column
    names in the averaged output and in the per-year output, rather than
    being dropped as unknown types.
    """
    boundaries_dir = tmp_path / "units"
    postcode_square = _square_feature("AB1 1AA", 1000, 1000, 1010, 1010)
    _write_boundaries(boundaries_dir, {"AB1": [postcode_square]})

    crime_dir = tmp_path / "crime"
    legacy_rows = [
        _crime_row("2013-01", 1005, 1005, legacy_name)
        for legacy_name in ("Violent crime", "Public disorder and weapons")
    ]
    _write_month(crime_dir, "2013-01", legacy_rows)

    averages_path = tmp_path / "crime_by_postcode.parquet"
    yearly_path = tmp_path / "crime_by_postcode_by_year.parquet"
    transform_crime_spatial(
        crime_dir,
        boundaries_dir,
        averages_path,
        yearly_path,
        buffer_m=50.0,
    )

    # With a lone postcode the area-normalisation factor is 1.0, and a single
    # month within a single year is annualised by multiplying by 12.
    averages = pl.read_parquet(averages_path).to_dicts()[0]
    for avg_column in (
        "Violence and sexual offences (avg/yr)",
        "Public order (avg/yr)",
    ):
        assert averages[avg_column] == 12.0

    yearly = pl.read_parquet(yearly_path).row(0, named=True)
    for year_column in (
        "Violence and sexual offences (by year)",
        "Public order (by year)",
    ):
        assert yearly[year_column] == [{"year": 2013, "count": 12.0}]