from __future__ import annotations import json import zipfile import polars as pl from pipeline.validate_outputs import main def polygon(offset=0): x = float(offset) return { "type": "Polygon", "coordinates": [ [(x, 0.0), (x + 0.001, 0.0), (x + 0.001, 0.001), (x, 0.001), (x, 0.0)] ], } def write_boundary(path, postcodes, geometries=None): units = path / "units" units.mkdir(parents=True) features = [ { "type": "Feature", "properties": {"postcodes": postcode}, "geometry": (geometries[index] if geometries else polygon(index)), } for index, postcode in enumerate(postcodes) ] (units / "AA1.geojson").write_text( json.dumps({"type": "FeatureCollection", "features": features}) ) def test_validates_parquet_file_and_zip(tmp_path, monkeypatch): parquet_path = tmp_path / "data.parquet" file_path = tmp_path / "plain.txt" zip_path = tmp_path / "archive.zip" pl.DataFrame({"value": [1]}).write_parquet(parquet_path) file_path.write_text("ok\n") with zipfile.ZipFile(zip_path, "w") as archive: archive.writestr("data.txt", "ok\n") monkeypatch.setattr( "sys.argv", [ "validate_outputs", "--parquet", str(parquet_path), "--file", str(file_path), "--zip", str(zip_path), "--glob", f"{tmp_path}::*.txt", "--zip-glob", f"{tmp_path}::*.zip", ], ) assert main() == 0 def test_rejects_missing_and_empty_outputs(tmp_path, monkeypatch, capsys): empty_path = tmp_path / "empty.txt" empty_path.touch() monkeypatch.setattr( "sys.argv", [ "validate_outputs", "--file", str(empty_path), "--parquet", str(tmp_path / "missing.parquet"), "--glob", f"{tmp_path}::*.csv", ], ) assert main() == 1 stderr = capsys.readouterr().err assert "empty file" in stderr assert "missing" in stderr assert "no files matched" in stderr def test_validates_postcode_boundary_matches(tmp_path, monkeypatch): postcodes_path = tmp_path / "postcodes.parquet" boundaries_path = tmp_path / "postcode_boundaries" pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]}).write_parquet(postcodes_path) write_boundary(boundaries_path, ["AA1 1AA", "AA1 1AB"]) monkeypatch.setattr( "sys.argv", [ "validate_outputs", "--postcode-boundary-match", f"{postcodes_path}::{boundaries_path}", ], ) assert main() == 0 def test_rejects_postcode_boundary_mismatch(tmp_path, monkeypatch, capsys): postcodes_path = tmp_path / "postcodes.parquet" boundaries_path = tmp_path / "postcode_boundaries" pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]}).write_parquet(postcodes_path) write_boundary(boundaries_path, ["AA1 1AA", "AA1 1AC"]) monkeypatch.setattr( "sys.argv", [ "validate_outputs", "--postcode-boundary-match", f"{postcodes_path}::{boundaries_path}", ], ) assert main() == 1 stderr = capsys.readouterr().err assert "missing boundaries" in stderr assert "boundary postcodes are absent" in stderr def test_rejects_invalid_postcode_boundary_features(tmp_path, monkeypatch, capsys): postcodes_path = tmp_path / "postcodes.parquet" boundaries_path = tmp_path / "postcode_boundaries" units = boundaries_path / "units" units.mkdir(parents=True) pl.DataFrame({"postcode": ["AA1 1AA"]}).write_parquet(postcodes_path) bowtie = { "type": "Polygon", "coordinates": [[(0, 0), (1, 1), (1, 0), (0, 1), (0, 0)]], } features = [ { "type": "Feature", "properties": {"postcodes": "AA1 1AA"}, "geometry": polygon(), }, { "type": "Feature", "properties": {"postcodes": "AA1 1AA"}, "geometry": polygon(1), }, {"type": "Feature", "properties": {}, "geometry": polygon(2)}, {"type": "Feature", "properties": {"postcodes": "AA1 1AB"}, "geometry": None}, {"type": "Feature", "properties": {"postcodes": "AA1 1AC"}, "geometry": bowtie}, ] (units / "AA1.geojson").write_text( json.dumps({"type": "FeatureCollection", "features": features}) ) monkeypatch.setattr( "sys.argv", [ "validate_outputs", "--postcode-boundary-match", f"{postcodes_path}::{boundaries_path}", ], ) assert main() == 1 stderr = capsys.readouterr().err assert "duplicate boundary postcode features" in stderr assert "missing properties.postcodes" in stderr assert "missing or empty geometry" in stderr assert "invalid boundary geometries" in stderr def test_validates_active_english_arcgis_boundary_matches(tmp_path, monkeypatch): arcgis_path = tmp_path / "arcgis.parquet" boundaries_path = tmp_path / "postcode_boundaries" pl.DataFrame( { "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"], "ctry25cd": ["E92000001", "E92000001", "W92000004"], "doterm": [None, "2020-01-01", None], } ).write_parquet(arcgis_path) write_boundary(boundaries_path, ["AA1 1AA"]) monkeypatch.setattr( "sys.argv", [ "validate_outputs", "--active-postcode-boundary-match", f"{arcgis_path}::{boundaries_path}", ], ) assert main() == 0 def test_rejects_active_english_arcgis_boundary_mismatch(tmp_path, monkeypatch, capsys): arcgis_path = tmp_path / "arcgis.parquet" boundaries_path = tmp_path / "postcode_boundaries" pl.DataFrame( { "pcds": ["AA1 1AA", "AA1 1AB", "CF1 1AA"], "ctry25cd": ["E92000001", "E92000001", "W92000004"], "doterm": [None, None, None], } ).write_parquet(arcgis_path) write_boundary(boundaries_path, ["AA1 1AA", "CF1 1AA"]) monkeypatch.setattr( "sys.argv", [ "validate_outputs", "--active-postcode-boundary-match", f"{arcgis_path}::{boundaries_path}", ], ) assert main() == 1 stderr = capsys.readouterr().err assert "active English postcodes" in stderr assert "not active English postcodes" in stderr