import polars as pl

from pipeline.transform.crime import find_street_crime_csvs, transform_crime

# Column header used by every street-level crime CSV fixture in this module.
_STREET_HEADER = "Crime ID,Month,Reported by,Falls within,Longitude,Latitude,Location,LSOA code,LSOA name,Crime type,Last outcome category,Context"


def _make_month_dir(crime_root, month):
    """Create ``crime_root / month`` (and any missing parents) and return it."""
    target = crime_root / month
    target.mkdir(parents=True)
    return target


def _write_lines(csv_path, lines):
    """Write *lines* to *csv_path*, newline-joined with a trailing newline."""
    csv_path.write_text("\n".join(lines) + "\n")


def test_find_street_crime_csvs_ignores_archive_sidecars(tmp_path):
    """Only the ``-street.csv`` file is returned; the other three are counted.

    The fixture holds one street CSV plus an outcomes CSV, a stop-and-search
    CSV and a stray ``notes.csv`` at the top of the crime directory.
    """
    crime_root = tmp_path / "crime"
    january = _make_month_dir(crime_root, "2024-01")

    street_csv = january / "2024-01-test-force-street.csv"
    street_csv.touch()
    for sidecar in (
        january / "2024-01-test-force-outcomes.csv",
        january / "2024-01-test-force-stop-and-search.csv",
        crime_root / "notes.csv",
    ):
        sidecar.touch()

    found, skipped = find_street_crime_csvs(crime_root)

    assert found == [street_csv]
    assert skipped == 3


def test_transform_crime_reads_only_street_crime_csvs(tmp_path):
    """The outcomes sidecar must not influence the transformed output.

    The expected result contains a single row for ``E01000001`` with a
    burglary figure of 24.0; the robbery row, whose LSOA code is empty,
    does not appear in the expected output.
    """
    crime_root = tmp_path / "crime"
    january = _make_month_dir(crime_root, "2024-01")

    _write_lines(
        january / "2024-01-test-force-street.csv",
        [
            _STREET_HEADER,
            "1,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
            "2,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
            "3,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,,No LSOA,Robbery,Under investigation,",
        ],
    )
    outcomes_csv = january / "2024-01-test-force-outcomes.csv"
    outcomes_csv.write_text(
        "Crime ID,Month,Reported by,Outcome type\n1,2024-01,Test Force,Charged\n"
    )

    parquet_path = tmp_path / "crime.parquet"
    transform_crime(crime_root, parquet_path)

    rows = pl.read_parquet(parquet_path).to_dicts()
    assert rows == [{"LSOA code": "E01000001", "Burglary (avg/yr)": 24.0}]


def test_transform_crime_annualises_over_all_valid_months(tmp_path):
    """Yearly averages are computed across both months present in the input.

    January has two burglaries in ``E01000001`` and one robbery in
    ``E01000002``; February has one robbery in ``E01000002``.  Each LSOA is
    expected to report 12.0 for its own crime type and 0.0 for the other.
    """
    crime_root = tmp_path / "crime"
    january = _make_month_dir(crime_root, "2024-01")
    february = _make_month_dir(crime_root, "2024-02")

    _write_lines(
        january / "2024-01-test-force-street.csv",
        [
            _STREET_HEADER,
            "1,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
            "2,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
            "3,2024-01,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
        ],
    )
    _write_lines(
        february / "2024-02-test-force-street.csv",
        [
            _STREET_HEADER,
            "4,2024-02,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000002,Other LSOA,Robbery,Under investigation,",
        ],
    )

    parquet_path = tmp_path / "crime.parquet"
    transform_crime(crime_root, parquet_path)

    # Sort so the comparison does not depend on the output row order.
    rows = pl.read_parquet(parquet_path).sort("LSOA code").to_dicts()
    expected = [
        {
            "LSOA code": "E01000001",
            "Burglary (avg/yr)": 12.0,
            "Robbery (avg/yr)": 0.0,
        },
        {
            "LSOA code": "E01000002",
            "Burglary (avg/yr)": 0.0,
            "Robbery (avg/yr)": 12.0,
        },
    ]
    assert rows == expected


def test_transform_crime_fails_without_valid_months(tmp_path):
    """A street CSV whose only row has an empty ``Month`` raises ValueError.

    The error message must mention "No valid crime months".
    """
    crime_root = tmp_path / "crime"
    january = _make_month_dir(crime_root, "2024-01")

    _write_lines(
        january / "2024-01-test-force-street.csv",
        [
            _STREET_HEADER,
            "1,,Test Force,Test Force,-0.1,51.5,On or near Test Street,E01000001,Test LSOA,Burglary,Under investigation,",
        ],
    )

    parquet_path = tmp_path / "crime.parquet"
    caught = None
    try:
        transform_crime(crime_root, parquet_path)
    except ValueError as exc:
        caught = exc

    # Returning normally is a failure; any other exception type propagates.
    if caught is None:
        raise AssertionError("Expected ValueError")
    assert "No valid crime months" in str(caught)