from __future__ import annotations import zipfile import json import polars as pl from pipeline.validate_outputs import main def write_boundary(path, postcodes): units = path / "units" units.mkdir(parents=True) features = [ {"type": "Feature", "properties": {"postcodes": postcode}, "geometry": None} for postcode in postcodes ] (units / "AA1.geojson").write_text( json.dumps({"type": "FeatureCollection", "features": features}) ) def test_validates_parquet_file_and_zip(tmp_path, monkeypatch): parquet_path = tmp_path / "data.parquet" file_path = tmp_path / "plain.txt" zip_path = tmp_path / "archive.zip" pl.DataFrame({"value": [1]}).write_parquet(parquet_path) file_path.write_text("ok\n") with zipfile.ZipFile(zip_path, "w") as archive: archive.writestr("data.txt", "ok\n") monkeypatch.setattr( "sys.argv", [ "validate_outputs", "--parquet", str(parquet_path), "--file", str(file_path), "--zip", str(zip_path), "--glob", f"{tmp_path}::*.txt", "--zip-glob", f"{tmp_path}::*.zip", ], ) assert main() == 0 def test_rejects_missing_and_empty_outputs(tmp_path, monkeypatch, capsys): empty_path = tmp_path / "empty.txt" empty_path.touch() monkeypatch.setattr( "sys.argv", [ "validate_outputs", "--file", str(empty_path), "--parquet", str(tmp_path / "missing.parquet"), "--glob", f"{tmp_path}::*.csv", ], ) assert main() == 1 stderr = capsys.readouterr().err assert "empty file" in stderr assert "missing" in stderr assert "no files matched" in stderr def test_validates_postcode_boundary_matches(tmp_path, monkeypatch): postcodes_path = tmp_path / "postcodes.parquet" boundaries_path = tmp_path / "postcode_boundaries" pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]}).write_parquet(postcodes_path) write_boundary(boundaries_path, ["AA1 1AA", "AA1 1AB"]) monkeypatch.setattr( "sys.argv", [ "validate_outputs", "--postcode-boundary-match", f"{postcodes_path}::{boundaries_path}", ], ) assert main() == 0 def test_rejects_postcode_boundary_mismatch(tmp_path, monkeypatch, capsys): postcodes_path = tmp_path / "postcodes.parquet" boundaries_path = tmp_path / "postcode_boundaries" pl.DataFrame({"postcode": ["AA1 1AA", "AA1 1AB"]}).write_parquet(postcodes_path) write_boundary(boundaries_path, ["AA1 1AA", "AA1 1AC"]) monkeypatch.setattr( "sys.argv", [ "validate_outputs", "--postcode-boundary-match", f"{postcodes_path}::{boundaries_path}", ], ) assert main() == 1 stderr = capsys.readouterr().err assert "missing boundaries" in stderr assert "boundary postcodes are absent" in stderr