perfect-postcode/pipeline/transform/test_crime.py
2026-05-14 08:09:19 +01:00

119 lines
4.3 KiB
Python

import polars as pl
from pipeline.transform.crime import find_street_crime_csvs, transform_crime
def test_find_street_crime_csvs_ignores_archive_sidecars(tmp_path):
    """Only the ``-street.csv`` file is returned; other CSVs are counted as ignored.

    The fixture holds one street CSV plus three other CSVs: an outcomes file
    and a stop-and-search file in the month folder, and a stray ``notes.csv``
    directly under the crime root.
    """
    root = tmp_path / "crime"
    january = root / "2024-01"
    january.mkdir(parents=True)

    expected = january / "2024-01-test-force-street.csv"
    sidecars = [
        january / "2024-01-test-force-outcomes.csv",
        january / "2024-01-test-force-stop-and-search.csv",
        root / "notes.csv",
    ]
    # Empty files are enough: discovery is asserted on paths only.
    for path in [expected, *sidecars]:
        path.touch()

    found, skipped = find_street_crime_csvs(root)

    assert found == [expected]
    assert skipped == 3
def test_transform_crime_reads_only_street_crime_csvs(tmp_path):
    """The parquet output is built from the street CSV, not the outcomes CSV.

    The street CSV has two burglary rows for ``E01000001`` and one robbery row
    with an empty LSOA code; an outcomes CSV with a different schema sits in
    the same month folder. The expected output is a single ``E01000001`` row
    with a burglary average of 24.0 and no robbery column.
    """
    source_root = tmp_path / "crime"
    january = source_root / "2024-01"
    january.mkdir(parents=True)

    street_lines = [
        "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context",
        "1,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
        "2,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
        "3,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,,No LSOA,Robbery,Under investigation,",
    ]
    street_path = january / "2024-01-test-force-street.csv"
    # Every line, including the last, is newline-terminated.
    street_path.write_text("".join(line + "\n" for line in street_lines))

    outcomes_path = january / "2024-01-test-force-outcomes.csv"
    outcomes_path.write_text(
        "Crime ID,Month,Reported by,Outcome type\n1,2024-01,Test Force,Charged\n"
    )

    parquet_path = tmp_path / "crime.parquet"
    transform_crime(source_root, parquet_path)

    rows = pl.read_parquet(parquet_path).to_dicts()
    assert rows == [{"LSOA code": "E01000001", "Burglary (avg/yr)": 24.0}]
def test_transform_crime_annualises_over_all_valid_months(tmp_path):
    """Yearly averages are expected to reflect both months present in the data.

    January holds two burglaries for ``E01000001`` and one robbery for
    ``E01000002``; February holds one further robbery for ``E01000002``.
    With two months of data, each LSOA is expected to show 12.0 per year for
    its own crime type and 0.0 for the other.
    """
    crime_dir = tmp_path / "crime"
    header = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"

    january_rows = [
        header,
        "1,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
        "2,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
        "3,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
    ]
    february_rows = [
        header,
        "4,2024-02,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
    ]

    january = crime_dir / "2024-01"
    january.mkdir(parents=True)
    (january / "2024-01-test-force-street.csv").write_text(
        "".join(line + "\n" for line in january_rows)
    )

    february = crime_dir / "2024-02"
    february.mkdir(parents=True)
    (february / "2024-02-test-force-street.csv").write_text(
        "".join(line + "\n" for line in february_rows)
    )

    parquet_path = tmp_path / "crime.parquet"
    transform_crime(crime_dir, parquet_path)

    expected = [
        {
            "LSOA code": "E01000001",
            "Burglary (avg/yr)": 12.0,
            "Robbery (avg/yr)": 0.0,
        },
        {
            "LSOA code": "E01000002",
            "Burglary (avg/yr)": 0.0,
            "Robbery (avg/yr)": 12.0,
        },
    ]
    # Sort by LSOA code so the comparison does not depend on output row order.
    actual = pl.read_parquet(parquet_path).sort("LSOA code").to_dicts()
    assert actual == expected
def test_transform_crime_fails_without_valid_months(tmp_path):
    """A street CSV whose only row has an empty Month must raise ``ValueError``.

    The error message is expected to contain ``"No valid crime months"``.
    """
    crime_dir = tmp_path / "crime"
    month_dir = crime_dir / "2024-01"
    month_dir.mkdir(parents=True)

    csv_lines = [
        "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context",
        # The Month field (second column) is deliberately left blank.
        "1,,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
    ]
    (month_dir / "2024-01-test-force-street.csv").write_text(
        "".join(line + "\n" for line in csv_lines)
    )

    parquet_path = tmp_path / "crime.parquet"

    caught = None
    try:
        transform_crime(crime_dir, parquet_path)
    except ValueError as exc:
        caught = exc

    # Fail explicitly when the call returned without raising.
    if caught is None:
        raise AssertionError("Expected ValueError")
    assert "No valid crime months" in str(caught)